]> git.donarmstrong.com Git - mothur.git/blob - bayesian.cpp
added pipeline commands which involved change to command factory and command class...
[mothur.git] / bayesian.cpp
1 /*
2  *  bayesian.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 11/3/09.
6  *  Copyright 2009 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "bayesian.h"
11 #include "kmer.hpp"
12 #include "phylosummary.h"
13
14 /**************************************************************************************************/
15 Bayesian::Bayesian(string tfile, string tempFile, string method, int ksize, int cutoff, int i) : 
16 Classify(), kmerSize(ksize), confidenceThreshold(cutoff), iters(i)  {
17         try {
18                                         
19                 /************calculate the probablity that each word will be in a specific taxonomy*************/
20                 string tfileroot = tfile.substr(0,tfile.find_last_of(".")+1);
21                 string tempfileroot = m->getRootName(m->getSimpleName(tempFile));
22                 string phyloTreeName = tfileroot + "tree.train";
23                 string phyloTreeSumName = tfileroot + "tree.sum";
24                 string probFileName = tfileroot + tempfileroot + char('0'+ kmerSize) + "mer.prob";
25                 string probFileName2 = tfileroot + tempfileroot + char('0'+ kmerSize) + "mer.numNonZero";
26                 
27                 ofstream out;
28                 ofstream out2;
29                 
30                 ifstream phyloTreeTest(phyloTreeName.c_str());
31                 ifstream probFileTest2(probFileName2.c_str());
32                 ifstream probFileTest(probFileName.c_str());
33                 ifstream probFileTest3(phyloTreeSumName.c_str());
34                 
35                 int start = time(NULL);
36                 
37                 //if they are there make sure they were created after this release date
38                 bool FilesGood = false;
39                 if(probFileTest && probFileTest2 && phyloTreeTest && probFileTest3){
40                         FilesGood = checkReleaseDate(probFileTest, probFileTest2, phyloTreeTest, probFileTest3);
41                 }
42                 
43                 if(probFileTest && probFileTest2 && phyloTreeTest && probFileTest3 && FilesGood){       
44                         m->mothurOut("Reading template taxonomy...     "); cout.flush();
45                         
46                         phyloTree = new PhyloTree(phyloTreeTest, phyloTreeName);
47                         
48                         m->mothurOut("DONE."); m->mothurOutEndLine();
49                         
50                         genusNodes = phyloTree->getGenusNodes(); 
51                         genusTotals = phyloTree->getGenusTotals();
52                 
53                         m->mothurOut("Reading template probabilities...     "); cout.flush();
54                         readProbFile(probFileTest, probFileTest2, probFileName, probFileName2); 
55                         
56                 }else{
57                 
58                         //create search database and names vector
59                         generateDatabaseAndNames(tfile, tempFile, method, ksize, 0.0, 0.0, 0.0, 0.0);
60                         
61                         //prevents errors caused by creating shortcut files if you had an error in the sanity check.
62                         if (m->control_pressed) {  remove(phyloTreeName.c_str());  remove(probFileName.c_str()); remove(probFileName2.c_str()); }
63                         else{ 
64                                 genusNodes = phyloTree->getGenusNodes(); 
65                                 genusTotals = phyloTree->getGenusTotals();
66                                 
67                                 m->mothurOut("Calculating template taxonomy tree...     "); cout.flush();
68                                 
69                                 phyloTree->printTreeNodes(phyloTreeName);
70                                                         
71                                 m->mothurOut("DONE."); m->mothurOutEndLine();
72                                 
73                                 m->mothurOut("Calculating template probabilities...     "); cout.flush();
74                                 
75                                 numKmers = database->getMaxKmer() + 1;
76                         
77                                 //initialze probabilities
78                                 wordGenusProb.resize(numKmers);
79                         cout << numKmers << '\t' << genusNodes.size() << endl;
80                                 for (int j = 0; j < wordGenusProb.size(); j++) {        wordGenusProb[j].resize(genusNodes.size());             }
81                         cout << numKmers << '\t' << genusNodes.size() << endl;  
82                                 ofstream out;
83                                 ofstream out2;
84                                 
85                                 #ifdef USE_MPI
86                                         int pid;
87                                         MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
88
89                                         if (pid == 0) {  
90                                 #endif
91
92                                 
93                                 m->openOutputFile(probFileName, out);
94                                 
95                                 //output mothur version
96                                 out << "#" << m->getVersion() << endl;
97                                 
98                                 out << numKmers << endl;
99                                 
100                                 m->openOutputFile(probFileName2, out2);
101                                 
102                                 //output mothur version
103                                 out2 << "#" << m->getVersion() << endl;
104                                 
105                                 #ifdef USE_MPI
106                                         }
107                                 #endif
108
109                                 
110                                 //for each word
111                                 for (int i = 0; i < numKmers; i++) {
112                                         if (m->control_pressed) {  break; }
113                                         
114                                         #ifdef USE_MPI
115                                                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
116
117                                                 if (pid == 0) {  
118                                         #endif
119
120                                         out << i << '\t';
121                                         
122                                         #ifdef USE_MPI
123                                                 }
124                                         #endif
125                                         
126                                         vector<int> seqsWithWordi = database->getSequencesWithKmer(i);
127                                         
128                                         map<int, int> count;
129                                         for (int k = 0; k < genusNodes.size(); k++) {  count[genusNodes[k]] = 0;  }                     
130                                                         
131                                         //for each sequence with that word
132                                         for (int j = 0; j < seqsWithWordi.size(); j++) {
133                                                 int temp = phyloTree->getIndex(names[seqsWithWordi[j]]);
134                                                 count[temp]++;  //increment count of seq in this genus who have this word
135                                         }
136                                         
137                                         //probabilityInTemplate = (# of seqs with that word in template + 0.05) / (total number of seqs in template + 1);
138                                         float probabilityInTemplate = (seqsWithWordi.size() + 0.50) / (float) (names.size() + 1);
139                                         
140                                         int numNotZero = 0;
141                                         for (int k = 0; k < genusNodes.size(); k++) {
142                                                 //probabilityInThisTaxonomy = (# of seqs with that word in this taxonomy + probabilityInTemplate) / (total number of seqs in this taxonomy + 1);
143                                                 wordGenusProb[i][k] = log((count[genusNodes[k]] + probabilityInTemplate) / (float) (genusTotals[k] + 1));  
144                                                 if (count[genusNodes[k]] != 0) { 
145                                                         #ifdef USE_MPI
146                                                                 int pid;
147                                                                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
148                                                 
149                                                                 if (pid == 0) {  
150                                                         #endif
151
152                                                         out << k << '\t' << wordGenusProb[i][k] << '\t'; 
153                                                         
154                                                         #ifdef USE_MPI
155                                                                 }
156                                                         #endif
157
158                                                         numNotZero++;  
159                                                 }
160                                         }
161                                         
162                                         #ifdef USE_MPI
163                                                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
164                                 
165                                                 if (pid == 0) {  
166                                         #endif
167                                         
168                                         out << endl;
169                                         out2 << probabilityInTemplate << '\t' << numNotZero << endl;
170                                         
171                                         #ifdef USE_MPI
172                                                 }
173                                         #endif
174                                 }
175                                 
176                                 #ifdef USE_MPI
177                                         MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
178                                 
179                                         if (pid == 0) {  
180                                 #endif
181                                 
182                                 out.close();
183                                 out2.close();
184                                 
185                                 #ifdef USE_MPI
186                                         }
187                                 #endif
188                                 
189                                 //read in new phylotree with less info. - its faster
190                                 ifstream phyloTreeTest(phyloTreeName.c_str());
191                                 delete phyloTree;
192                                 
193                                 phyloTree = new PhyloTree(phyloTreeTest, phyloTreeName);
194                         }
195                 }
196         
197                 m->mothurOut("DONE."); m->mothurOutEndLine();
198                 m->mothurOut("It took " + toString(time(NULL) - start) + " seconds get probabilities. "); m->mothurOutEndLine();
199         }
200         catch(exception& e) {
201                 m->errorOut(e, "Bayesian", "Bayesian");
202                 exit(1);
203         }
204 }
205 /**************************************************************************************************/
206 Bayesian::~Bayesian() {
207         try {
208                  delete phyloTree; 
209                  if (database != NULL) {  delete database; }
210         }
211         catch(exception& e) {
212                 m->errorOut(e, "Bayesian", "~Bayesian");
213                 exit(1);
214         }
215 }
216
217 /**************************************************************************************************/
218 string Bayesian::getTaxonomy(Sequence* seq) {
219         try {
220                 string tax = "";
221                 Kmer kmer(kmerSize);
222                 
223                 //get words contained in query
224                 //getKmerString returns a string where the index in the string is hte kmer number 
225                 //and the character at that index can be converted to be the number of times that kmer was seen
226
227                 string queryKmerString = kmer.getKmerString(seq->getUnaligned()); 
228
229                 vector<int> queryKmers;
230                 for (int i = 0; i < queryKmerString.length(); i++) {
231                         if (queryKmerString[i] != '!') { //this kmer is in the query
232                                 queryKmers.push_back(i);
233                         }
234                 }
235                 
236                 if (queryKmers.size() == 0) {  m->mothurOut(seq->getName() + "is bad."); m->mothurOutEndLine(); return "bad seq"; }
237                 
238                 int index = getMostProbableTaxonomy(queryKmers);
239                 
240                 if (m->control_pressed) { return tax; }
241 //cout << seq->getName() << '\t' << index << endl;                                      
242                 //bootstrap - to set confidenceScore
243                 int numToSelect = queryKmers.size() / 8;
244                 tax = bootstrapResults(queryKmers, index, numToSelect);
245                                                 
246                 return tax;     
247         }
248         catch(exception& e) {
249                 m->errorOut(e, "Bayesian", "getTaxonomy");
250                 exit(1);
251         }
252 }
253 /**************************************************************************************************/
254 string Bayesian::bootstrapResults(vector<int> kmers, int tax, int numToSelect) {
255         try {
256                                 
257                 map<int, int> confidenceScores; 
258                                 
259                 map<int, int>::iterator itBoot;
260                 map<int, int>::iterator itBoot2;
261                 map<int, int>::iterator itConvert;
262                 
263                 for (int i = 0; i < iters; i++) {
264                         if (m->control_pressed) { return "control"; }
265                         
266                         vector<int> temp;
267                                                 
268                         for (int j = 0; j < numToSelect; j++) {
269                                 int index = int(rand() % kmers.size());
270                                 
271                                 //add word to temp
272                                 temp.push_back(kmers[index]);
273                         }
274                         
275                         //get taxonomy
276                         int newTax = getMostProbableTaxonomy(temp);
277                         TaxNode taxonomyTemp = phyloTree->get(newTax);
278         
279                         //add to confidence results
280                         while (taxonomyTemp.level != 0) { //while you are not at the root
281                                 
282                                 itBoot2 = confidenceScores.find(newTax); //is this a classification we already have a count on
283                                 
284                                 if (itBoot2 == confidenceScores.end()) { //not already in confidence scores
285                                         confidenceScores[newTax] = 1;
286                                 }else{
287                                         confidenceScores[newTax]++;
288                                 }
289                                 
290                                 newTax = taxonomyTemp.parent;
291                                 taxonomyTemp = phyloTree->get(newTax);
292                         }
293         
294                 }
295                 
296                 string confidenceTax = "";
297                 simpleTax = "";
298                 
299                 int seqTaxIndex = tax;
300                 TaxNode seqTax = phyloTree->get(tax);
301                 
302                 while (seqTax.level != 0) { //while you are not at the root
303                                         
304                                 itBoot2 = confidenceScores.find(seqTaxIndex); //is this a classification we already have a count on
305                                 
306                                 int confidence = 0;
307                                 if (itBoot2 != confidenceScores.end()) { //already in confidence scores
308                                         confidence = confidenceScores[seqTaxIndex];
309                                 }
310                                 
311                                 if (((confidence/(float)iters) * 100) >= confidenceThreshold) {
312                                         confidenceTax = seqTax.name + "(" + toString(((confidence/(float)iters) * 100)) + ");" + confidenceTax;
313                                         simpleTax = seqTax.name + ";" + simpleTax;
314                                 }
315                                 
316                                 seqTaxIndex = seqTax.parent;
317                                 seqTax = phyloTree->get(seqTax.parent);
318                 }
319                 
320                 if (confidenceTax == "") { confidenceTax = "unclassified;"; simpleTax = "unclassified;"; }
321                 return confidenceTax;
322                 
323         }
324         catch(exception& e) {
325                 m->errorOut(e, "Bayesian", "bootstrapResults");
326                 exit(1);
327         }
328 }
329 /**************************************************************************************************/
330 int Bayesian::getMostProbableTaxonomy(vector<int> queryKmer) {
331         try {
332                 int indexofGenus = 0;
333                 
334                 double maxProbability = -1000000.0;
335                 //find taxonomy with highest probability that this sequence is from it
336                 for (int k = 0; k < genusNodes.size(); k++) {
337                         //for each taxonomy calc its probability
338                         double prob = 1.0;
339                         for (int i = 0; i < queryKmer.size(); i++) {
340                                 prob += wordGenusProb[queryKmer[i]][k];
341                         }
342                 
343                         //is this the taxonomy with the greatest probability?
344                         if (prob > maxProbability) { 
345                                 indexofGenus = genusNodes[k];
346                                 maxProbability = prob;
347                         }
348                 }
349
350                 return indexofGenus;
351         }
352         catch(exception& e) {
353                 m->errorOut(e, "Bayesian", "getMostProbableTaxonomy");
354                 exit(1);
355         }
356 }
357 /*************************************************************************************************
358 map<string, int> Bayesian::parseTaxMap(string newTax) {
359         try{
360         
361                 map<string, int> parsed;
362                 
363                 newTax = newTax.substr(0, newTax.length()-1);  //get rid of last ';'
364         
365                 //parse taxonomy
366                 string individual;
367                 while (newTax.find_first_of(';') != -1) {
368                         individual = newTax.substr(0,newTax.find_first_of(';'));
369                         newTax = newTax.substr(newTax.find_first_of(';')+1, newTax.length());
370                         parsed[individual] = 1;
371                 }
372                 
373                 //get last one
374                 parsed[newTax] = 1;
375
376                 return parsed;
377                 
378         }
379         catch(exception& e) {
380                 m->errorOut(e, "Bayesian", "parseTax");
381                 exit(1);
382         }
383 }
384 /**************************************************************************************************/
385 void Bayesian::readProbFile(ifstream& in, ifstream& inNum, string inName, string inNumName) {
386         try{
387                 
388                 #ifdef USE_MPI
389                         
390                         int pid, num, num2, processors;
391                         vector<unsigned long int> positions;
392                         vector<unsigned long int> positions2;
393                         
394                         MPI_Status status; 
395                         MPI_File inMPI;
396                         MPI_File inMPI2;
397                         MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
398                         MPI_Comm_size(MPI_COMM_WORLD, &processors);
399                         int tag = 2001;
400
401                         char inFileName[1024];
402                         strcpy(inFileName, inNumName.c_str());
403                         
404                         char inFileName2[1024];
405                         strcpy(inFileName2, inName.c_str());
406
407                         MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI);  //comm, filename, mode, info, filepointer
408                         MPI_File_open(MPI_COMM_WORLD, inFileName2, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI2);  //comm, filename, mode, info, filepointer
409
410                         if (pid == 0) {
411                                 positions = m->setFilePosEachLine(inNumName, num);
412                                 positions2 = m->setFilePosEachLine(inName, num2);
413                                 
414                                 for(int i = 1; i < processors; i++) { 
415                                         MPI_Send(&num, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
416                                         MPI_Send(&positions[0], (num+1), MPI_LONG, i, tag, MPI_COMM_WORLD);
417                                         
418                                         MPI_Send(&num2, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
419                                         MPI_Send(&positions2[0], (num2+1), MPI_LONG, i, tag, MPI_COMM_WORLD);
420                                 }
421
422                         }else{
423                                 MPI_Recv(&num, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
424                                 positions.resize(num+1);
425                                 MPI_Recv(&positions[0], (num+1), MPI_LONG, 0, tag, MPI_COMM_WORLD, &status);
426                                 
427                                 MPI_Recv(&num2, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
428                                 positions2.resize(num2+1);
429                                 MPI_Recv(&positions2[0], (num2+1), MPI_LONG, 0, tag, MPI_COMM_WORLD, &status);
430                         }
431                         
432                         //read version
433                         int length = positions2[1] - positions2[0];
434                         char* buf5 = new char[length];
435
436                         MPI_File_read_at(inMPI2, positions2[0], buf5, length, MPI_CHAR, &status);
437                         delete buf5;
438
439                         //read numKmers
440                         length = positions2[2] - positions2[1];
441                         char* buf = new char[length];
442
443                         MPI_File_read_at(inMPI2, positions2[1], buf, length, MPI_CHAR, &status);
444
445                         string tempBuf = buf;
446                         if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length); }
447                         delete buf;
448
449                         istringstream iss (tempBuf,istringstream::in);
450                         iss >> numKmers;  
451                         
452                         //initialze probabilities
453                         wordGenusProb.resize(numKmers);
454                         
455                         for (int j = 0; j < wordGenusProb.size(); j++) {        wordGenusProb[j].resize(genusNodes.size());             }
456                         
457                         int kmer, name;  
458                         vector<int> numbers; numbers.resize(numKmers);
459                         float prob;
460                         vector<float> zeroCountProb; zeroCountProb.resize(numKmers);    
461                         
462                         //read version
463                         length = positions[1] - positions[0];
464                         char* buf6 = new char[length];
465
466                         MPI_File_read_at(inMPI2, positions[0], buf6, length, MPI_CHAR, &status);
467                         delete buf6;
468                         
469                         //read file 
470                         for(int i=1;i<num;i++){
471                                 //read next sequence
472                                 length = positions[i+1] - positions[i];
473                                 char* buf4 = new char[length];
474
475                                 MPI_File_read_at(inMPI, positions[i], buf4, length, MPI_CHAR, &status);
476
477                                 tempBuf = buf4;
478                                 if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length); }
479                                 delete buf4;
480
481                                 istringstream iss (tempBuf,istringstream::in);
482                                 iss >> zeroCountProb[i] >> numbers[i];  
483                         }
484                         
485                         MPI_File_close(&inMPI);
486                         
487                         for(int i=2;i<num2;i++){
488                                 //read next sequence
489                                 length = positions2[i+1] - positions2[i];
490                                 char* buf4 = new char[length];
491
492                                 MPI_File_read_at(inMPI2, positions2[i], buf4, length, MPI_CHAR, &status);
493
494                                 tempBuf = buf4;
495                                 if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length); }
496                                 delete buf4;
497
498                                 istringstream iss (tempBuf,istringstream::in);
499                                 
500                                 iss >> kmer;
501                                 
502                                 //set them all to zero value
503                                 for (int i = 0; i < genusNodes.size(); i++) {
504                                         wordGenusProb[kmer][i] = log(zeroCountProb[kmer] / (float) (genusTotals[i]+1));
505                                 }
506                                 
507                                 //get probs for nonzero values
508                                 for (int i = 0; i < numbers[kmer]; i++) {
509                                         iss >> name >> prob;
510                                         wordGenusProb[kmer][name] = prob;
511                                 }
512                                 
513                         }
514                         MPI_File_close(&inMPI2);
515                         MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case
516                 #else
517                         //read version
518                         string line = m->getline(in); m->gobble(in);
519                         
520                         in >> numKmers; m->gobble(in);
521                         
522                         //initialze probabilities
523                         wordGenusProb.resize(numKmers);
524                         
525                         for (int j = 0; j < wordGenusProb.size(); j++) {        wordGenusProb[j].resize(genusNodes.size());             }
526                         
527                         int kmer, name, count;  count = 0;
528                         vector<int> num; num.resize(numKmers);
529                         float prob;
530                         vector<float> zeroCountProb; zeroCountProb.resize(numKmers);            
531                         
532                         //read version
533                         string line2 = m->getline(inNum); m->gobble(inNum);
534                         
535                         while (inNum) {
536                                 inNum >> zeroCountProb[count] >> num[count];  
537                                 count++;
538                                 m->gobble(inNum);
539                         }
540                         inNum.close();
541                 
542                         while(in) {
543                                 in >> kmer;
544                                 
545                                 //set them all to zero value
546                                 for (int i = 0; i < genusNodes.size(); i++) {
547                                         wordGenusProb[kmer][i] = log(zeroCountProb[kmer] / (float) (genusTotals[i]+1));
548                                 }
549                                 
550                                 //get probs for nonzero values
551                                 for (int i = 0; i < num[kmer]; i++) {
552                                         in >> name >> prob;
553                                         wordGenusProb[kmer][name] = prob;
554                                 }
555                                 
556                                 m->gobble(in);
557                         }
558                         in.close();
559                         
560                 #endif
561         }
562         catch(exception& e) {
563                 m->errorOut(e, "Bayesian", "readProbFile");
564                 exit(1);
565         }
566 }
567 /**************************************************************************************************/
568 bool Bayesian::checkReleaseDate(ifstream& file1, ifstream& file2, ifstream& file3, ifstream& file4) {
569         try {
570                 
571                 bool good = true;
572                 
573                 vector<string> lines;
574                 lines.push_back(m->getline(file1));  
575                 lines.push_back(m->getline(file2)); 
576                 lines.push_back(m->getline(file3)); 
577                 lines.push_back(m->getline(file4)); 
578
579                 //before we added this check
580                 if ((lines[0][0] != '#') || (lines[1][0] != '#') || (lines[2][0] != '#') || (lines[3][0] != '#')) {  good = false;  }
581                 else {
582                         //rip off #
583                         for (int i = 0; i < lines.size(); i++) { lines[i] = lines[i].substr(1);  }
584                         
585                         //get mothurs current version
586                         string version = m->getVersion();
587                         
588                         vector<string> versionVector;
589                         m->splitAtChar(version, versionVector, '.');
590                         
591                         //check each files version
592                         for (int i = 0; i < lines.size(); i++) { 
593                                 vector<string> linesVector;
594                                 m->splitAtChar(lines[i], linesVector, '.');
595                         
596                                 if (versionVector.size() != linesVector.size()) { good = false; break; }
597                                 else {
598                                         for (int j = 0; j < versionVector.size(); j++) {
599                                                 int num1, num2;
600                                                 convert(versionVector[j], num1);
601                                                 convert(linesVector[j], num2);
602                                                 
603                                                 //if mothurs version is newer than this files version, then we want to remake it
604                                                 if (num1 > num2) {  good = false; break;  }
605                                         }
606                                 }
607                                 
608                                 if (!good) { break; }
609                         }
610                 }
611                 
612                 if (!good) {  file1.close(); file2.close(); file3.close(); file4.close();  }
613                 else { file1.seekg(0);  file2.seekg(0);  file3.seekg(0);  file4.seekg(0);  }
614                 
615                 return good;
616         }
617         catch(exception& e) {
618                 m->errorOut(e, "Bayesian", "checkReleaseDate");
619                 exit(1);
620         }
621 }
622 /**************************************************************************************************/
623
624
625
626
627
628