]> git.donarmstrong.com Git - mothur.git/blob - bayesian.cpp
optimizing classify.seqs calculating of template probabilities.
[mothur.git] / bayesian.cpp
1 /*
2  *  bayesian.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 11/3/09.
6  *  Copyright 2009 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "bayesian.h"
11 #include "kmer.hpp"
12 #include "phylosummary.h"
13 #include "referencedb.h"
14 /**************************************************************************************************/
15 Bayesian::Bayesian(string tfile, string tempFile, string method, int ksize, int cutoff, int i, int tid, bool f, bool sh) : 
16 Classify(), kmerSize(ksize), confidenceThreshold(cutoff), iters(i) {
17         try {
18                 ReferenceDB* rdb = ReferenceDB::getInstance();
19                 
20                 threadID = tid;
21                 flip = f;
22         shortcuts = sh;
23                 string baseName = tempFile;
24                         
25                 if (baseName == "saved") { baseName = rdb->getSavedReference(); }
26                 
27                 string baseTName = tfile;
28                 if (baseTName == "saved") { baseTName = rdb->getSavedTaxonomy(); }
29                 
30                 /************calculate the probablity that each word will be in a specific taxonomy*************/
31                 string tfileroot = m->getFullPathName(baseTName.substr(0,baseTName.find_last_of(".")+1));
32                 string tempfileroot = m->getRootName(m->getSimpleName(baseName));
33                 string phyloTreeName = tfileroot + "tree.train";
34                 string phyloTreeSumName = tfileroot + "tree.sum";
35                 string probFileName = tfileroot + tempfileroot + char('0'+ kmerSize) + "mer.prob";
36                 string probFileName2 = tfileroot + tempfileroot + char('0'+ kmerSize) + "mer.numNonZero";
37                 
38                 ofstream out;
39                 ofstream out2;
40                 
41                 ifstream phyloTreeTest(phyloTreeName.c_str());
42                 ifstream probFileTest2(probFileName2.c_str());
43                 ifstream probFileTest(probFileName.c_str());
44                 ifstream probFileTest3(phyloTreeSumName.c_str());
45                 
46                 int start = time(NULL);
47                 
48                 //if they are there make sure they were created after this release date
49                 bool FilesGood = false;
50                 if(probFileTest && probFileTest2 && phyloTreeTest && probFileTest3){
51                         FilesGood = checkReleaseDate(probFileTest, probFileTest2, phyloTreeTest, probFileTest3);
52                 }
53                 
54                 //if you want to save, but you dont need to calculate then just read
55                 if (rdb->save && probFileTest && probFileTest2 && phyloTreeTest && probFileTest3 && FilesGood && (tempFile != "saved")) {  
56                         ifstream saveIn;
57                         m->openInputFile(tempFile, saveIn);
58                         
59                         while (!saveIn.eof()) {
60                                 Sequence temp(saveIn);
61                                 m->gobble(saveIn);
62                                 
63                                 rdb->referenceSeqs.push_back(temp); 
64                         }
65                         saveIn.close();                 
66                 }
67
68                 if(probFileTest && probFileTest2 && phyloTreeTest && probFileTest3 && FilesGood){       
69                         if (tempFile == "saved") { m->mothurOutEndLine();  m->mothurOut("Using sequences from " + rdb->getSavedReference() + " that are saved in memory.");     m->mothurOutEndLine(); }
70                         
71                         m->mothurOut("Reading template taxonomy...     "); cout.flush();
72                         
73                         phyloTree = new PhyloTree(phyloTreeTest, phyloTreeName);
74                         
75                         m->mothurOut("DONE."); m->mothurOutEndLine();
76                         
77                         genusNodes = phyloTree->getGenusNodes(); 
78                         genusTotals = phyloTree->getGenusTotals();
79                         
80                         if (tfile == "saved") { 
81                                 m->mothurOutEndLine();  m->mothurOut("Using probabilties from " + rdb->getSavedTaxonomy() + " that are saved in memory...    ");        cout.flush();; 
82                                 wordGenusProb = rdb->wordGenusProb;
83                                 WordPairDiffArr = rdb->WordPairDiffArr;
84                         }else {
85                                 m->mothurOut("Reading template probabilities...     "); cout.flush();
86                                 readProbFile(probFileTest, probFileTest2, probFileName, probFileName2);
87                         }       
88                         
89                         //save probabilities
90                         if (rdb->save) { rdb->wordGenusProb = wordGenusProb; rdb->WordPairDiffArr = WordPairDiffArr; }
91                 }else{
92                 
93                         //create search database and names vector
94                         generateDatabaseAndNames(tfile, tempFile, method, ksize, 0.0, 0.0, 0.0, 0.0);
95                         
96                         //prevents errors caused by creating shortcut files if you had an error in the sanity check.
97                         if (m->control_pressed) {  m->mothurRemove(phyloTreeName);  m->mothurRemove(probFileName); m->mothurRemove(probFileName2); }
98                         else{ 
99                                 genusNodes = phyloTree->getGenusNodes(); 
100                                 genusTotals = phyloTree->getGenusTotals();
101                                 
102                                 m->mothurOut("Calculating template taxonomy tree...     "); cout.flush();
103                                 
104                                 phyloTree->printTreeNodes(phyloTreeName);
105                                                         
106                                 m->mothurOut("DONE."); m->mothurOutEndLine();
107                                 
108                                 m->mothurOut("Calculating template probabilities...     "); cout.flush();
109                                 
110                                 numKmers = database->getMaxKmer() + 1;
111                         
112                                 //initialze probabilities
113                                 wordGenusProb.resize(numKmers);
114                                 WordPairDiffArr.resize(numKmers);
115                         
116                                 for (int j = 0; j < wordGenusProb.size(); j++) {        wordGenusProb[j].resize(genusNodes.size());             }
117                 ofstream out;
118                                 ofstream out2;
119                                 
120                                 #ifdef USE_MPI
121                                         int pid;
122                                         MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
123
124                                         if (pid == 0) {  
125                                 #endif
126
127                                 
128                 if (shortcuts) { 
129                     m->openOutputFile(probFileName, out); 
130                                 
131                     //output mothur version
132                     out << "#" << m->getVersion() << endl;
133                                 
134                     out << numKmers << endl;
135                                 
136                     m->openOutputFile(probFileName2, out2);
137                                 
138                     //output mothur version
139                     out2 << "#" << m->getVersion() << endl;
140                 }
141                                 
142                                 #ifdef USE_MPI
143                                         }
144                                 #endif
145
146                                 //for each word
147                                 for (int i = 0; i < numKmers; i++) {
148                                         if (m->control_pressed) {  break; }
149                                         
150                                         #ifdef USE_MPI
151                                                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
152
153                                                 if (pid == 0) {  
154                                         #endif
155
156                     if (shortcuts) {  out << i << '\t'; }
157                                         
158                                         #ifdef USE_MPI
159                                                 }
160                                         #endif
161                                         
162                                         vector<int> seqsWithWordi = database->getSequencesWithKmer(i);
163                                         
164                                         //for each sequence with that word
165                     vector<int> count; count.resize(genusNodes.size(), 0);
166                                         for (int j = 0; j < seqsWithWordi.size(); j++) {
167                                                 int temp = phyloTree->getGenusIndex(names[seqsWithWordi[j]]);
168                                                 count[temp]++;  //increment count of seq in this genus who have this word
169                                         }
170                                         
171                                         //probabilityInTemplate = (# of seqs with that word in template + 0.50) / (total number of seqs in template + 1);
172                                         float probabilityInTemplate = (seqsWithWordi.size() + 0.50) / (float) (names.size() + 1);
173                                         diffPair tempProb(log(probabilityInTemplate), 0.0);
174                                         WordPairDiffArr[i] = tempProb;
175                                                 
176                                         int numNotZero = 0;
177                                         for (int k = 0; k < genusNodes.size(); k++) {
178                                                 //probabilityInThisTaxonomy = (# of seqs with that word in this taxonomy + probabilityInTemplate) / (total number of seqs in this taxonomy + 1);
179                                                 
180                                                 
181                                                 wordGenusProb[i][k] = log((count[k] + probabilityInTemplate) / (float) (genusTotals[k] + 1));  
182                                                                         
183                                                 if (count[k] != 0) { 
184                                                         #ifdef USE_MPI
185                                                                 int pid;
186                                                                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
187                                                 
188                                                                 if (pid == 0) {  
189                                                         #endif
190
191                             if (shortcuts) { out << k << '\t' << wordGenusProb[i][k] << '\t' ; }
192                                                         
193                                                         #ifdef USE_MPI
194                                                                 }
195                                                         #endif
196
197                                                         numNotZero++;  
198                                                 }
199                                         }
200                                         
201                                         #ifdef USE_MPI
202                                                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
203                                 
204                                                 if (pid == 0) {  
205                                         #endif
206                                         
207                             if (shortcuts) { 
208                                 out << endl;
209                                 out2 << probabilityInTemplate << '\t' << numNotZero << '\t' << log(probabilityInTemplate) << endl;
210                             }
211                                         
212                                         #ifdef USE_MPI
213                                                 }
214                                         #endif
215                                 }
216                                 
217                                 #ifdef USE_MPI
218                                         MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
219                                 
220                                         if (pid == 0) {  
221                                 #endif
222                                 
223                         if (shortcuts) { 
224                             out.close();
225                             out2.close();
226                         }
227                                 #ifdef USE_MPI
228                                         }
229                                 #endif
230                                 
231                                 //read in new phylotree with less info. - its faster
232                                 ifstream phyloTreeTest(phyloTreeName.c_str());
233                                 delete phyloTree;
234                                 
235                                 phyloTree = new PhyloTree(phyloTreeTest, phyloTreeName);
236                 
237                                 //save probabilities
238                                 if (rdb->save) { rdb->wordGenusProb = wordGenusProb; rdb->WordPairDiffArr = WordPairDiffArr; }
239                         }
240                 }
241                 
242                 generateWordPairDiffArr();
243                 
244                 //save probabilities
245                 if (rdb->save) { rdb->wordGenusProb = wordGenusProb; rdb->WordPairDiffArr = WordPairDiffArr; }
246                 
247                 m->mothurOut("DONE."); m->mothurOutEndLine();
248                 m->mothurOut("It took " + toString(time(NULL) - start) + " seconds get probabilities. "); m->mothurOutEndLine();
249         }
250         catch(exception& e) {
251                 m->errorOut(e, "Bayesian", "Bayesian");
252                 exit(1);
253         }
254 }
255 /**************************************************************************************************/
256 Bayesian::~Bayesian() {
257         try {
258                 
259                  delete phyloTree; 
260                  if (database != NULL) {  delete database; }
261         }
262         catch(exception& e) {
263                 m->errorOut(e, "Bayesian", "~Bayesian");
264                 exit(1);
265         }
266 }
267
268 /**************************************************************************************************/
269 string Bayesian::getTaxonomy(Sequence* seq) {
270         try {
271                 string tax = "";
272                 Kmer kmer(kmerSize);
273                 flipped = false;
274                 
275                 //get words contained in query
276                 //getKmerString returns a string where the index in the string is hte kmer number 
277                 //and the character at that index can be converted to be the number of times that kmer was seen
278                 string queryKmerString = kmer.getKmerString(seq->getUnaligned()); 
279                 
280                 vector<int> queryKmers;
281                 for (int i = 0; i < queryKmerString.length()-1; i++) {  // the -1 is to ignore any kmer with an N in it
282                         if (queryKmerString[i] != '!') { //this kmer is in the query
283                                 queryKmers.push_back(i);
284                         }
285                 }
286                 
287                 //if user wants to test reverse compliment and its reversed use that instead
288                 if (flip) {     
289                         if (isReversed(queryKmers)) { 
290                                 flipped = true;
291                                 seq->reverseComplement(); 
292                                 queryKmerString = kmer.getKmerString(seq->getUnaligned()); 
293                                 queryKmers.clear();
294                                 for (int i = 0; i < queryKmerString.length()-1; i++) {  // the -1 is to ignore any kmer with an N in it
295                                         if (queryKmerString[i] != '!') { //this kmer is in the query
296                                                 queryKmers.push_back(i);
297                                         }
298                                 }
299                         }  
300                 }
301                 
302                 if (queryKmers.size() == 0) {  m->mothurOut(seq->getName() + "is bad."); m->mothurOutEndLine(); simpleTax = "unknown;";  return "unknown;"; }
303                 
304                 
305                 int index = getMostProbableTaxonomy(queryKmers);
306                 
307                 if (m->control_pressed) { return tax; }
308                                         
309                 //bootstrap - to set confidenceScore
310                 int numToSelect = queryKmers.size() / 8;
311         
312                 tax = bootstrapResults(queryKmers, index, numToSelect);
313                 
314                 return tax;     
315         }
316         catch(exception& e) {
317                 m->errorOut(e, "Bayesian", "getTaxonomy");
318                 exit(1);
319         }
320 }
321 /**************************************************************************************************/
322 string Bayesian::bootstrapResults(vector<int> kmers, int tax, int numToSelect) {
323         try {
324                                 
325                 map<int, int> confidenceScores; 
326                 
327                 //initialize confidences to 0 
328                 int seqIndex = tax;
329                 TaxNode seq = phyloTree->get(tax);
330                 confidenceScores[tax] = 0;
331                 
332                 while (seq.level != 0) { //while you are not at the root
333                         seqIndex = seq.parent;
334                         confidenceScores[seqIndex] = 0;
335                         seq = phyloTree->get(seq.parent);
336                 }
337                                 
338                 map<int, int>::iterator itBoot;
339                 map<int, int>::iterator itBoot2;
340                 map<int, int>::iterator itConvert;
341                         
342                 for (int i = 0; i < iters; i++) {
343                         if (m->control_pressed) { return "control"; }
344                         
345                         vector<int> temp;
346                         for (int j = 0; j < numToSelect; j++) {
347                                 int index = int(rand() % kmers.size());
348                                 
349                                 //add word to temp
350                                 temp.push_back(kmers[index]);
351                         }
352                         
353                         //get taxonomy
354                         int newTax = getMostProbableTaxonomy(temp);
355                         //int newTax = 1;
356                         TaxNode taxonomyTemp = phyloTree->get(newTax);
357                         
358                         //add to confidence results
359                         while (taxonomyTemp.level != 0) { //while you are not at the root
360                                 itBoot2 = confidenceScores.find(newTax); //is this a classification we already have a count on
361                                 
362                                 if (itBoot2 != confidenceScores.end()) { //this is a classification we need a confidence for
363                                         (itBoot2->second)++;
364                                 }
365                                 
366                                 newTax = taxonomyTemp.parent;
367                                 taxonomyTemp = phyloTree->get(newTax);
368                         }
369         
370                 }
371                 
372                 string confidenceTax = "";
373                 simpleTax = "";
374                 
375                 int seqTaxIndex = tax;
376                 TaxNode seqTax = phyloTree->get(tax);
377                 
378                 while (seqTax.level != 0) { //while you are not at the root
379                                         
380                                 itBoot2 = confidenceScores.find(seqTaxIndex); //is this a classification we already have a count on
381                                 
382                                 int confidence = 0;
383                                 if (itBoot2 != confidenceScores.end()) { //already in confidence scores
384                                         confidence = itBoot2->second;
385                                 }
386                                 
387                                 if (((confidence/(float)iters) * 100) >= confidenceThreshold) {
388                                         confidenceTax = seqTax.name + "(" + toString(((confidence/(float)iters) * 100)) + ");" + confidenceTax;
389                                         simpleTax = seqTax.name + ";" + simpleTax;
390                                 }
391                                 
392                                 seqTaxIndex = seqTax.parent;
393                                 seqTax = phyloTree->get(seqTax.parent);
394                 }
395                 
396                 if (confidenceTax == "") { confidenceTax = "unknown;"; simpleTax = "unknown;";  }
397         
398                 return confidenceTax;
399                 
400         }
401         catch(exception& e) {
402                 m->errorOut(e, "Bayesian", "bootstrapResults");
403                 exit(1);
404         }
405 }
406 /**************************************************************************************************/
407 int Bayesian::getMostProbableTaxonomy(vector<int> queryKmer) {
408         try {
409                 int indexofGenus = 0;
410                 
411                 double maxProbability = -1000000.0;
412                 //find taxonomy with highest probability that this sequence is from it
413                 
414                 
415 //              cout << genusNodes.size() << endl;
416                 
417                 
418                 for (int k = 0; k < genusNodes.size(); k++) {
419                         //for each taxonomy calc its probability
420                         
421                         double prob = 0.0000;
422                         for (int i = 0; i < queryKmer.size(); i++) {
423                                 prob += wordGenusProb[queryKmer[i]][k];
424                         }
425                         
426 //                      cout << phyloTree->get(genusNodes[k]).name << '\t' << prob << endl;
427
428                         //is this the taxonomy with the greatest probability?
429                         if (prob > maxProbability) { 
430                                 indexofGenus = genusNodes[k];
431                                 maxProbability = prob;
432                         }
433                 }
434                 
435                         
436                 return indexofGenus;
437         }
438         catch(exception& e) {
439                 m->errorOut(e, "Bayesian", "getMostProbableTaxonomy");
440                 exit(1);
441         }
442 }
443 //********************************************************************************************************************
444 //if it is more probable that the reverse compliment kmers are in the template, then we assume the sequence is reversed.
445 bool Bayesian::isReversed(vector<int>& queryKmers){
446         try{
447                 bool reversed = false;
448                 float prob = 0;
449                 float reverseProb = 0;
450                  
451         for (int i = 0; i < queryKmers.size(); i++){
452             int kmer = queryKmers[i];
453             if (kmer >= 0){
454                 prob += WordPairDiffArr[kmer].prob;
455                                 reverseProb += WordPairDiffArr[kmer].reverseProb;
456             }
457         }
458                 
459         if (reverseProb > prob){ reversed = true; }
460         
461                 return reversed;
462         }
463         catch(exception& e) {
464                 m->errorOut(e, "Bayesian", "isReversed");
465                 exit(1);
466         }
467 }
468 //********************************************************************************************************************
469 int Bayesian::generateWordPairDiffArr(){
470         try{
471                 Kmer kmer(kmerSize);
472                 for (int i = 0; i < WordPairDiffArr.size(); i++) {
473                         int reversedWord = kmer.getReverseKmerNumber(i);
474                         WordPairDiffArr[i].reverseProb = WordPairDiffArr[reversedWord].prob;
475                 }
476                 
477                 return 0;
478         }catch(exception& e) {
479                 m->errorOut(e, "Bayesian", "generateWordPairDiffArr");
480                 exit(1);
481         }
482 }
483 /*************************************************************************************************
484 map<string, int> Bayesian::parseTaxMap(string newTax) {
485         try{
486         
487                 map<string, int> parsed;
488                 
489                 newTax = newTax.substr(0, newTax.length()-1);  //get rid of last ';'
490         
491                 //parse taxonomy
492                 string individual;
493                 while (newTax.find_first_of(';') != -1) {
494                         individual = newTax.substr(0,newTax.find_first_of(';'));
495                         newTax = newTax.substr(newTax.find_first_of(';')+1, newTax.length());
496                         parsed[individual] = 1;
497                 }
498                 
499                 //get last one
500                 parsed[newTax] = 1;
501
502                 return parsed;
503                 
504         }
505         catch(exception& e) {
506                 m->errorOut(e, "Bayesian", "parseTax");
507                 exit(1);
508         }
509 }
510 **************************************************************************************************/
511 void Bayesian::readProbFile(ifstream& in, ifstream& inNum, string inName, string inNumName) {
512         try{
513                 
514                 #ifdef USE_MPI
515                         
516                         int pid, num, num2, processors;
517                         vector<unsigned long long> positions;
518                         vector<unsigned long long> positions2;
519                         
520                         MPI_Status status; 
521                         MPI_File inMPI;
522                         MPI_File inMPI2;
523                         MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
524                         MPI_Comm_size(MPI_COMM_WORLD, &processors);
525                         int tag = 2001;
526
527                         char inFileName[1024];
528                         strcpy(inFileName, inNumName.c_str());
529                         
530                         char inFileName2[1024];
531                         strcpy(inFileName2, inName.c_str());
532
533                         MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI);  //comm, filename, mode, info, filepointer
534                         MPI_File_open(MPI_COMM_WORLD, inFileName2, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI2);  //comm, filename, mode, info, filepointer
535
536                         if (pid == 0) {
537                                 positions = m->setFilePosEachLine(inNumName, num);
538                                 positions2 = m->setFilePosEachLine(inName, num2);
539                                 
540                                 for(int i = 1; i < processors; i++) { 
541                                         MPI_Send(&num, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
542                                         MPI_Send(&positions[0], (num+1), MPI_LONG, i, tag, MPI_COMM_WORLD);
543                                         
544                                         MPI_Send(&num2, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
545                                         MPI_Send(&positions2[0], (num2+1), MPI_LONG, i, tag, MPI_COMM_WORLD);
546                                 }
547
548                         }else{
549                                 MPI_Recv(&num, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
550                                 positions.resize(num+1);
551                                 MPI_Recv(&positions[0], (num+1), MPI_LONG, 0, tag, MPI_COMM_WORLD, &status);
552                                 
553                                 MPI_Recv(&num2, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
554                                 positions2.resize(num2+1);
555                                 MPI_Recv(&positions2[0], (num2+1), MPI_LONG, 0, tag, MPI_COMM_WORLD, &status);
556                         }
557                         
558                         //read version
559                         int length = positions2[1] - positions2[0];
560                         char* buf5 = new char[length];
561
562                         MPI_File_read_at(inMPI2, positions2[0], buf5, length, MPI_CHAR, &status);
563                         delete buf5;
564
565                         //read numKmers
566                         length = positions2[2] - positions2[1];
567                         char* buf = new char[length];
568
569                         MPI_File_read_at(inMPI2, positions2[1], buf, length, MPI_CHAR, &status);
570
571                         string tempBuf = buf;
572                         if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length); }
573                         delete buf;
574
575                         istringstream iss (tempBuf,istringstream::in);
576                         iss >> numKmers;  
577                         
578                         //initialze probabilities
579                         wordGenusProb.resize(numKmers);
580                         
581                         for (int j = 0; j < wordGenusProb.size(); j++) {        wordGenusProb[j].resize(genusNodes.size());             }
582                         
583                         int kmer, name;  
584                         vector<int> numbers; numbers.resize(numKmers);
585                         float prob;
586                         vector<float> zeroCountProb; zeroCountProb.resize(numKmers);
587                         WordPairDiffArr.resize(numKmers);
588                         
589                         //read version
590                         length = positions[1] - positions[0];
591                         char* buf6 = new char[length];
592
593                         MPI_File_read_at(inMPI2, positions[0], buf6, length, MPI_CHAR, &status);
594                         delete buf6;
595                         
596                         //read file 
597                         for(int i=1;i<num;i++){
598                                 //read next sequence
599                                 length = positions[i+1] - positions[i];
600                                 char* buf4 = new char[length];
601
602                                 MPI_File_read_at(inMPI, positions[i], buf4, length, MPI_CHAR, &status);
603
604                                 tempBuf = buf4;
605                                 if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length); }
606                                 delete buf4;
607
608                                 istringstream iss (tempBuf,istringstream::in);
609                                 float probTemp;
610                                 iss >> zeroCountProb[i] >> numbers[i] >> probTemp; 
611                                 WordPairDiffArr[i].prob = probTemp;
612
613                         }
614                         
615                         MPI_File_close(&inMPI);
616                         
617                         for(int i=2;i<num2;i++){
618                                 //read next sequence
619                                 length = positions2[i+1] - positions2[i];
620                                 char* buf4 = new char[length];
621
622                                 MPI_File_read_at(inMPI2, positions2[i], buf4, length, MPI_CHAR, &status);
623
624                                 tempBuf = buf4;
625                                 if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length); }
626                                 delete buf4;
627
628                                 istringstream iss (tempBuf,istringstream::in);
629                                 
630                                 iss >> kmer;
631                                 
632                                 //set them all to zero value
633                                 for (int i = 0; i < genusNodes.size(); i++) {
634                                         wordGenusProb[kmer][i] = log(zeroCountProb[kmer] / (float) (genusTotals[i]+1));
635                                 }
636                                 
637                                 //get probs for nonzero values
638                                 for (int i = 0; i < numbers[kmer]; i++) {
639                                         iss >> name >> prob;
640                                         wordGenusProb[kmer][name] = prob;
641                                 }
642                                 
643                         }
644                         MPI_File_close(&inMPI2);
645                         MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case
646                 #else
647                         //read version
648                         string line = m->getline(in); m->gobble(in);
649                         
650                         in >> numKmers; m->gobble(in);
651                         //cout << threadID << '\t' << line << '\t' << numKmers << &in << '\t' << &inNum << '\t' << genusNodes.size() << endl;
652                         //initialze probabilities
653                         wordGenusProb.resize(numKmers);
654                         
655                         for (int j = 0; j < wordGenusProb.size(); j++) {        wordGenusProb[j].resize(genusNodes.size());             }
656                         
657                         int kmer, name, count;  count = 0;
658                         vector<int> num; num.resize(numKmers);
659                         float prob;
660                         vector<float> zeroCountProb; zeroCountProb.resize(numKmers);    
661                         WordPairDiffArr.resize(numKmers);
662                         
663                         //read version
664                         string line2 = m->getline(inNum); m->gobble(inNum);
665                         float probTemp;
666                 //cout << threadID << '\t' << line2 << '\t' << this << endl;    
667                         while (inNum) {
668                                 inNum >> zeroCountProb[count] >> num[count] >> probTemp; 
669                                 WordPairDiffArr[count].prob = probTemp;
670                                 count++;
671                                 m->gobble(inNum);
672                                 //cout << threadID << '\t' << count << endl;
673                         }
674                         inNum.close();
675                 //cout << threadID << '\t' << "here1 " << &wordGenusProb << '\t' << &num << endl; //
676                 //cout << threadID << '\t' << &genusTotals << '\t' << endl; 
677                 //cout << threadID << '\t' << genusNodes.size() << endl;
678                         while(in) {
679                                 in >> kmer;
680                         //cout << threadID << '\t' << kmer << endl;
681                                 //set them all to zero value
682                                 for (int i = 0; i < genusNodes.size(); i++) {
683                                         wordGenusProb[kmer][i] = log(zeroCountProb[kmer] / (float) (genusTotals[i]+1));
684                                 }
685                         //cout << threadID << '\t' << num[kmer] << "here" << endl;      
686                                 //get probs for nonzero values
687                                 for (int i = 0; i < num[kmer]; i++) {
688                                         in >> name >> prob;
689                                         wordGenusProb[kmer][name] = prob;
690                                 }
691                                 
692                                 m->gobble(in);
693                         }
694                         in.close();
695                 //cout << threadID << '\t' << "here" << endl;   
696                 #endif
697         }
698         catch(exception& e) {
699                 m->errorOut(e, "Bayesian", "readProbFile");
700                 exit(1);
701         }
702 }
703 /**************************************************************************************************/
704 bool Bayesian::checkReleaseDate(ifstream& file1, ifstream& file2, ifstream& file3, ifstream& file4) {
705         try {
706                 
707                 bool good = true;
708                 
709                 vector<string> lines;
710                 lines.push_back(m->getline(file1));  
711                 lines.push_back(m->getline(file2)); 
712                 lines.push_back(m->getline(file3)); 
713                 lines.push_back(m->getline(file4)); 
714
715                 //before we added this check
716                 if ((lines[0][0] != '#') || (lines[1][0] != '#') || (lines[2][0] != '#') || (lines[3][0] != '#')) {  good = false;  }
717                 else {
718                         //rip off #
719                         for (int i = 0; i < lines.size(); i++) { lines[i] = lines[i].substr(1);  }
720                         
721                         //get mothurs current version
722                         string version = m->getVersion();
723                         
724                         vector<string> versionVector;
725                         m->splitAtChar(version, versionVector, '.');
726                         
727                         //check each files version
728                         for (int i = 0; i < lines.size(); i++) { 
729                                 vector<string> linesVector;
730                                 m->splitAtChar(lines[i], linesVector, '.');
731                         
732                                 if (versionVector.size() != linesVector.size()) { good = false; break; }
733                                 else {
734                                         for (int j = 0; j < versionVector.size(); j++) {
735                                                 int num1, num2;
736                                                 convert(versionVector[j], num1);
737                                                 convert(linesVector[j], num2);
738                                                 
739                                                 //if mothurs version is newer than this files version, then we want to remake it
740                                                 if (num1 > num2) {  good = false; break;  }
741                                         }
742                                 }
743                                 
744                                 if (!good) { break; }
745                         }
746                 }
747                 
748                 if (!good) {  file1.close(); file2.close(); file3.close(); file4.close();  }
749                 else { file1.seekg(0);  file2.seekg(0);  file3.seekg(0);  file4.seekg(0);  }
750                 
751                 return good;
752         }
753         catch(exception& e) {
754                 m->errorOut(e, "Bayesian", "checkReleaseDate");
755                 exit(1);
756         }
757 }
758 /**************************************************************************************************/
759
760
761
762
763
764