]> git.donarmstrong.com Git - mothur.git/blob - phylotree.cpp
changed random forest output filename
[mothur.git] / phylotree.cpp
1 /*
2  *  phylotree.cpp
3  *  
4  *
5  *  Created by Pat Schloss on 6/17/09.
6  *  Copyright 2009 Patrick D. Schloss. All rights reserved.
7  *
8  */
9
10 #include "phylotree.h"
11
12 /**************************************************************************************************/
13
14 PhyloTree::PhyloTree(){
15         try {
16                 m = MothurOut::getInstance();
17                 numNodes = 1;
18                 numSeqs = 0;
19                 tree.push_back(TaxNode("Root"));
20                 tree[0].heirarchyID = "0";
21                 maxLevel = 0;
22                 calcTotals = true;
23                 addSeqToTree("unknown", "unknown;");
24         }
25         catch(exception& e) {
26                 m->errorOut(e, "PhyloTree", "PhyloTree");
27                 exit(1);
28         }
29 }
30 /**************************************************************************************************/
31
32 PhyloTree::PhyloTree(ifstream& in, string filename){
33         try {
34                 m = MothurOut::getInstance();
35                 calcTotals = false;
36                 numNodes = 0;
37                 numSeqs = 0;
38                 
39                 #ifdef USE_MPI
40                         MPI_File inMPI;
41                         MPI_Offset size;
42                         MPI_Status status;
43
44                         char inFileName[1024];
45                         strcpy(inFileName, filename.c_str());
46
47                         MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI);  
48                         MPI_File_get_size(inMPI, &size);
49                         
50                         char* buffer = new char[size];
51                         MPI_File_read(inMPI, buffer, size, MPI_CHAR, &status);
52
53                         string tempBuf = buffer;
54                         if (tempBuf.length() > size) { tempBuf = tempBuf.substr(0, size);  }
55                         istringstream iss (tempBuf,istringstream::in);
56                         delete buffer;
57                         
58                         //read version
59                         m->getline(iss); m->gobble(iss);
60                         
61                         iss >> numNodes; m->gobble(iss);
62                         
63                         tree.resize(numNodes);
64                         
65                         for (int i = 0; i < tree.size(); i++) {
66                                 iss >> tree[i].name >> tree[i].level >> tree[i].parent; m->gobble(iss);
67                         }
68                         
69                         //read genus nodes
70                         int numGenus = 0;
71                         iss >> numGenus; m->gobble(iss);
72                         
73                         int gnode, gsize;
74                         totals.clear();
75                         for (int i = 0; i < numGenus; i++) {
76                                 iss >> gnode >> gsize; m->gobble(iss);
77                                 
78                                 uniqueTaxonomies.insert(gnode);
79                                 totals.push_back(gsize);
80                         }
81                         
82                         MPI_File_close(&inMPI);
83                         
84                 #else
85                         //read version
86                         string line = m->getline(in); m->gobble(in);
87                         
88                         in >> numNodes; m->gobble(in);
89                         
90                         tree.resize(numNodes);
91                         
92                         for (int i = 0; i < tree.size(); i++) {
93                                 in >> tree[i].name >> tree[i].level >> tree[i].parent; m->gobble(in);
94                         }
95                         
96                         //read genus nodes
97                         int numGenus = 0;
98                         in >> numGenus; m->gobble(in);
99                         
100                         int gnode, gsize;
101                         totals.clear();
102                         for (int i = 0; i < numGenus; i++) {
103                                 in >> gnode >> gsize; m->gobble(in);
104                                 
105                                 uniqueTaxonomies.insert(gnode);
106                                 totals.push_back(gsize);
107                         }
108                         
109                         in.close();
110                         
111                 #endif
112                 
113         }
114         catch(exception& e) {
115                 m->errorOut(e, "PhyloTree", "PhyloTree");
116                 exit(1);
117         }
118 }
119 /**************************************************************************************************/
120
121 PhyloTree::PhyloTree(string tfile){
122         try {
123                 m = MothurOut::getInstance();
124                 numNodes = 1;
125                 numSeqs = 0;
126                 tree.push_back(TaxNode("Root"));
127                 tree[0].heirarchyID = "0";
128                 maxLevel = 0;
129                 calcTotals = true;
130                 string name, tax;
131                 
132                 #ifdef USE_MPI
133                         int pid, num, processors;
134                         vector<unsigned long long> positions;
135                         
136                         MPI_Status status; 
137                         MPI_File inMPI;
138                         MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
139                         MPI_Comm_size(MPI_COMM_WORLD, &processors);
140
141                         char inFileName[1024];
142                         strcpy(inFileName, tfile.c_str());
143
144                         MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI);  //comm, filename, mode, info, filepointer
145
146                         if (pid == 0) {
147                                 positions = m->setFilePosEachLine(tfile, num);
148                                 
149                                 //send file positions to all processes
150                                 for(int i = 1; i < processors; i++) { 
151                                         MPI_Send(&num, 1, MPI_INT, i, 2001, MPI_COMM_WORLD);
152                                         MPI_Send(&positions[0], (num+1), MPI_LONG, i, 2001, MPI_COMM_WORLD);
153                                 }
154                         }else{
155                                 MPI_Recv(&num, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status);
156                                 positions.resize(num+1);
157                                 MPI_Recv(&positions[0], (num+1), MPI_LONG, 0, 2001, MPI_COMM_WORLD, &status);
158                         }
159                 
160                         //read file 
161                         for(int i=0;i<num;i++){
162                                 //read next sequence
163                                 int length = positions[i+1] - positions[i];
164                                 char* buf4 = new char[length];
165
166                                 MPI_File_read_at(inMPI, positions[i], buf4, length, MPI_CHAR, &status);
167
168                                 string tempBuf = buf4;
169                                 if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length); }
170                                 delete buf4;
171
172                                 istringstream iss (tempBuf,istringstream::in);
173                                 iss >> name >> tax;
174                                 addSeqToTree(name, tax);
175                         }
176                         
177                         MPI_File_close(&inMPI);
178                         MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case
179                 
180                 #else
181             map<string, string> temp;
182             m->readTax(tfile, temp);
183         
184             for (map<string, string>::iterator itTemp = temp.begin(); itTemp != temp.end();) {
185                 addSeqToTree(itTemp->first, itTemp->second);
186                 temp.erase(itTemp++);
187             }
188                 #endif
189         
190                 assignHeirarchyIDs(0);
191         
192         
193         string unknownTax = "unknown;";
194         //added last taxon until you get desired level
195                 for (int i = 1; i < maxLevel; i++) {
196                         unknownTax += "unclassfied;";
197                 }
198         
199         addSeqToTree("unknown", unknownTax);
200         
201                 //create file for summary if needed
202                 setUp(tfile);
203         }
204         catch(exception& e) {
205                 m->errorOut(e, "PhyloTree", "PhyloTree");
206                 exit(1);
207         }
208 }
209
210 /**************************************************************************************************/
211
212 string PhyloTree::getNextTaxon(string& heirarchy, string seqname){
213         try {
214                 string currentLevel = "";
215                 if(heirarchy != ""){
216                         int pos = heirarchy.find_first_of(';');
217                         
218                         if (pos == -1) { //you can't find another ;
219                                 currentLevel = heirarchy;
220                                 heirarchy = "";
221                                 m->mothurOut(seqname + " is missing a ;, please check for other errors."); m->mothurOutEndLine();
222                         }else{
223                                 currentLevel=heirarchy.substr(0,pos);
224                                 if (pos != (heirarchy.length()-1)) {  heirarchy=heirarchy.substr(pos+1);  }
225                                 else { heirarchy = ""; }
226                         }
227                         
228                 }
229                 return currentLevel;
230         }
231         catch(exception& e) {
232                 m->errorOut(e, "PhyloTree", "getNextTaxon");
233                 exit(1);
234         }
235 }
236
237 /**************************************************************************************************/
238
239 int PhyloTree::addSeqToTree(string seqName, string seqTaxonomy){
240         try {
241                 numSeqs++;
242                 
243                 map<string, int>::iterator childPointer;
244                 
245                 int currentNode = 0;
246                 int level = 1;
247                 
248                 tree[0].accessions.push_back(seqName);
249                 m->removeConfidences(seqTaxonomy);
250                 
251                 string taxon;// = getNextTaxon(seqTaxonomy);
252         
253                 while(seqTaxonomy != ""){
254                         
255                         level++;
256                 
257                         if (m->control_pressed) { return 0; }
258                         
259                         //somehow the parent is getting one too many accnos
260                         //use print to reassign the taxa id
261                         taxon = getNextTaxon(seqTaxonomy, seqName);
262             
263             if (m->debug) { m->mothurOut(seqName +'\t' + taxon +'\n'); }
264                         
265                         if (taxon == "") {  m->mothurOut(seqName + " has an error in the taxonomy.  This may be due to a ;;"); m->mothurOutEndLine(); if (currentNode != 0) {  uniqueTaxonomies.insert(currentNode); } break;  }
266                         
267                         childPointer = tree[currentNode].children.find(taxon);
268                         
269                         if(childPointer != tree[currentNode].children.end()){   //if the node already exists, move on
270                                 currentNode = childPointer->second;
271                                 tree[currentNode].accessions.push_back(seqName);
272                                 name2Taxonomy[seqName] = currentNode;
273                         }
274                         else{                                                                                   //otherwise, create it
275                                 tree.push_back(TaxNode(taxon));
276                                 numNodes++;
277                                 tree[currentNode].children[taxon] = numNodes-1;
278                                 tree[numNodes-1].parent = currentNode;
279                                 
280                                 currentNode = tree[currentNode].children[taxon];
281                                 tree[currentNode].accessions.push_back(seqName);
282                                 name2Taxonomy[seqName] = currentNode;
283                         }
284         
285                         if (seqTaxonomy == "") {   uniqueTaxonomies.insert(currentNode);        }
286                 }
287                 
288                 return 0;
289         }
290         catch(exception& e) {
291                 m->errorOut(e, "PhyloTree", "addSeqToTree");
292                 exit(1);
293         }
294 }
295 /**************************************************************************************************/
296 vector<int> PhyloTree::getGenusNodes()  {
297         try {
298                 genusIndex.clear();
299                 //generate genusIndexes
300                 set<int>::iterator it2;
301         map<int, int> temp;
302                 for (it2=uniqueTaxonomies.begin(); it2!=uniqueTaxonomies.end(); it2++) {  genusIndex.push_back(*it2);   temp[*it2] = genusIndex.size()-1; }
303                 
304         for (map<string, int>::iterator itName = name2Taxonomy.begin(); itName != name2Taxonomy.end(); itName++) {
305             map<int, int>::iterator itTemp = temp.find(itName->second);
306             if (itTemp != temp.end()) { name2GenusNodeIndex[itName->first] = itTemp->second; }
307             else {  m->mothurOut("[ERROR]: trouble making name2GenusNodeIndex, aborting.\n"); m->control_pressed = true; }
308         }
309         
310                 return genusIndex;
311         }
312         catch(exception& e) {
313                 m->errorOut(e, "PhyloTree", "getGenusNodes");
314                 exit(1);
315         }
316 }
317 /**************************************************************************************************/
318 vector<int> PhyloTree::getGenusTotals() {
319         try {
320         
321                 if (calcTotals) {
322                         totals.clear();
323                         //reset counts because we are on a new word
324                         for (int j = 0; j < genusIndex.size(); j++) {
325                                 totals.push_back(tree[genusIndex[j]].accessions.size());
326                         }
327                         return totals;
328                 }else{
329                         return totals;
330                 }
331                 
332         }
333         catch(exception& e) {
334                 m->errorOut(e, "PhyloTree", "getGenusNodes");
335                 exit(1);
336         }
337 }
338 /**************************************************************************************************/
339
340 void PhyloTree::assignHeirarchyIDs(int index){
341         try {
342                 map<string,int>::iterator it;
343                 int counter = 1;
344                 
345                 for(it=tree[index].children.begin();it!=tree[index].children.end();it++){
346             
347             if (m->debug) { m->mothurOut(toString(index) +'\t' + tree[it->second].name +'\n'); }
348                 
349                         tree[it->second].heirarchyID = tree[index].heirarchyID + '.' + toString(counter);
350                         counter++;
351                         tree[it->second].level = tree[index].level + 1;
352                                                 
353                         //save maxLevel for binning the unclassified seqs
354                         if (tree[it->second].level > maxLevel) { maxLevel = tree[it->second].level; } 
355                         
356                         assignHeirarchyIDs(it->second);
357                 }
358         }
359         catch(exception& e) {
360                 m->errorOut(e, "PhyloTree", "assignHeirarchyIDs");
361                 exit(1);
362         }
363 }
364 /**************************************************************************************************/
365 void PhyloTree::setUp(string tfile){
366         try{
367                 string taxFileNameTest = tfile.substr(0,tfile.find_last_of(".")+1) + "tree.sum";
368                 
369                 #ifdef USE_MPI
370                         int pid;
371                         MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
372
373                         if (pid == 0) {  binUnclassified(taxFileNameTest);  }
374                 
375                 #else
376                         binUnclassified(taxFileNameTest); 
377                 #endif
378         }
379         catch(exception& e) {
380                 m->errorOut(e, "PhyloTree", "setUp");
381                 exit(1);
382         }
383 }
384 /**************************************************************************************************/
385 void PhyloTree::binUnclassified(string file){
386         try {
387         
388                 ofstream out;
389                 m->openOutputFile(file, out);
390                 
391                 map<string, int>::iterator itBin;
392                 map<string, int>::iterator childPointer;
393                 
394                 vector<TaxNode> copy = tree;
395                 
396                 //fill out tree
397                 fillOutTree(0, copy);
398         
399                 //get leaf nodes that may need extension
400                 for (int i = 0; i < copy.size(); i++) {  
401
402                         if (copy[i].children.size() == 0) {
403                                 leafNodes[i] = i;
404                         }
405                 }
406                 
407         if (m->debug) { m->mothurOut("maxLevel = " + toString(maxLevel) +'\n'); }
408         
409                 int copyNodes = copy.size();
410         
411                 //go through the seqs and if a sequence finest taxon is not the same level as the most finely defined taxon then classify it as unclassified where necessary
412                 map<int, int>::iterator itLeaf;
413                 for (itLeaf = leafNodes.begin(); itLeaf != leafNodes.end(); itLeaf++) {
414                         
415                         if (m->control_pressed) {  out.close(); break;  }
416                         
417                         int level = copy[itLeaf->second].level;
418                         int currentNode = itLeaf->second;
419             
420             if (m->debug) { m->mothurOut(copy[currentNode].name +'\n'); }
421                         
422                         //this sequence is unclassified at some levels
423                         while(level < maxLevel){
424                 
425                                 level++;
426                 if (m->debug) { m->mothurOut("level = " + toString(level) +'\n'); }
427                         
428                                 string taxon = "unclassified";  
429                                 
430                                 //does the parent have a child names 'unclassified'?
431                                 childPointer = copy[currentNode].children.find(taxon);
432                                 
433                                 if(childPointer != copy[currentNode].children.end()){   //if the node already exists, move on
434                                         currentNode = childPointer->second; //currentNode becomes 'unclassified'
435                                 }
436                                 else{                                                                                   //otherwise, create it
437                                         copy.push_back(TaxNode(taxon));
438                                         copyNodes++;
439                                         copy[currentNode].children[taxon] = copyNodes-1;
440                                         copy[copyNodes-1].parent = currentNode;
441                                         copy[copyNodes-1].level = copy[currentNode].level + 1;
442                                                                         
443                                         currentNode = copy[currentNode].children[taxon];
444                                 }
445                         }
446                 }
447                 
448                 if (!m->control_pressed) {
449                         //print copy tree
450                         print(out, copy);
451                 }
452                                 
453         }
454         catch(exception& e) {
455                 m->errorOut(e, "PhyloTree", "binUnclassified");
456                 exit(1);
457         }
458 }
459 /**************************************************************************************************/
460 void PhyloTree::fillOutTree(int index, vector<TaxNode>& copy) {
461         try {
462         
463                 map<string,int>::iterator it;
464                 
465                 it = copy[index].children.find("unclassified");
466                 if (it == copy[index].children.end()) { //no unclassified at this level
467                         string taxon = "unclassified";
468                         copy.push_back(TaxNode(taxon));
469                         copy[index].children[taxon] = copy.size()-1;
470                         copy[copy.size()-1].parent = index;
471                         copy[copy.size()-1].level = copy[index].level + 1;
472                 }
473                 
474                 if (tree[index].level < maxLevel) {
475                         for(it=tree[index].children.begin();it!=tree[index].children.end();it++){ //check your children
476                                 fillOutTree(it->second, copy);
477                         }
478                 }
479
480         }
481         catch(exception& e) {
482                 m->errorOut(e, "PhyloTree", "fillOutTree");
483                 exit(1);
484         }
485 }
486 /**************************************************************************************************/
487 string PhyloTree::getFullTaxonomy(string seqName) {
488         try {
489                 string tax = "";
490                 
491                 int currentNode = name2Taxonomy[seqName];
492                 
493                 while (tree[currentNode].parent != -1) {
494                         tax = tree[currentNode].name + ";" + tax;
495                         currentNode = tree[currentNode].parent;
496                 }
497                 
498                 return tax;
499         }
500         catch(exception& e) {
501                 m->errorOut(e, "PhyloTree", "getFullTaxonomy");
502                 exit(1);
503         }
504 }
505 /**************************************************************************************************/
506
507 void PhyloTree::print(ofstream& out, vector<TaxNode>& copy){
508         try {
509                 
510                 //output mothur version
511                 out << "#" << m->getVersion() << endl;
512                 
513                 out << copy.size() << endl;
514                 
515                 out << maxLevel << endl;
516                                 
517                 for (int i = 0; i < copy.size(); i++) {
518                                 
519                         out << copy[i].level << '\t'<< copy[i].name << '\t' << copy[i].children.size() << '\t';
520                         
521                         map<string,int>::iterator it;
522                         for(it=copy[i].children.begin();it!=copy[i].children.end();it++){
523                                 out << it->first << '\t' << it->second << '\t';
524                         }
525                         out << endl;
526                 }
527                 
528                 out.close();
529         }
530         catch(exception& e) {
531                 m->errorOut(e, "PhyloTree", "print");
532                 exit(1);
533         }
534 }
535 /**************************************************************************************************/
536 void PhyloTree::printTreeNodes(string treefilename) {
537         try {
538         
539                 #ifdef USE_MPI
540                         int pid;
541                         MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
542
543                         if (pid == 0) {  
544                 
545                 #endif
546
547                         ofstream outTree;
548                         m->openOutputFile(treefilename, outTree);
549                         
550                         //output mothur version
551                         outTree << "#" << m->getVersion() << endl;
552                         
553                         //print treenodes
554                         outTree << tree.size() << endl;
555                         for (int i = 0; i < tree.size(); i++) {
556                                 outTree << tree[i].name << '\t' << tree[i].level << '\t' << tree[i].parent << endl;
557                         }
558                         
559                         //print genus nodes
560                         outTree << endl << uniqueTaxonomies.size() << endl;
561                         set<int>::iterator it2;
562                         for (it2=uniqueTaxonomies.begin(); it2!=uniqueTaxonomies.end(); it2++) {  outTree << *it2 << '\t' << tree[*it2].accessions.size() << endl;      }
563                         outTree << endl;
564                         
565                         outTree.close();
566                 
567                 #ifdef USE_MPI
568                         }
569                 #endif
570
571                 
572         }
573         catch(exception& e) {
574                 m->errorOut(e, "PhyloTree", "printTreeNodes");
575                 exit(1);
576         }
577 }
578 /**************************************************************************************************/
579 TaxNode PhyloTree::get(int i ){
580         try {
581                 if (i < tree.size()) {  return tree[i];  }
582                 else {  cout << i << '\t' << tree.size() << endl ; m->mothurOut("Mismatch with taxonomy and template files. Cannot continue."); m->mothurOutEndLine(); exit(1); }
583         }
584         catch(exception& e) {
585                 m->errorOut(e, "PhyloTree", "get");
586                 exit(1);
587         }
588 }
589 /**************************************************************************************************/
590 TaxNode PhyloTree::get(string seqName){
591         try {
592                 map<string, int>::iterator itFind = name2Taxonomy.find(seqName);
593         
594                 if (itFind != name2Taxonomy.end()) {  return tree[name2Taxonomy[seqName]];  }
595                 else { m->mothurOut("Cannot find " + seqName + ". Mismatch with taxonomy and template files. Cannot continue."); m->mothurOutEndLine(); exit(1);}
596         }
597         catch(exception& e) {
598                 m->errorOut(e, "PhyloTree", "get");
599                 exit(1);
600         }
601 }
602 /**************************************************************************************************/
603 string PhyloTree::getName(int i ){
604         try {
605                 if (i < tree.size()) {  return tree[i].name;     }
606                 else { m->mothurOut("Mismatch with taxonomy and template files. Cannot continue."); m->mothurOutEndLine(); exit(1); }
607         }
608         catch(exception& e) {
609                 m->errorOut(e, "PhyloTree", "get");
610                 exit(1);
611         }
612 }
613 /**************************************************************************************************/
614 int PhyloTree::getGenusIndex(string seqName){
615         try {
616                 map<string, int>::iterator itFind = name2GenusNodeIndex.find(seqName);
617         
618                 if (itFind != name2GenusNodeIndex.end()) {  return itFind->second;  }
619                 else { m->mothurOut("Cannot find " + seqName + ". Could be a mismatch with taxonomy and template files. Cannot continue."); m->mothurOutEndLine(); exit(1);}
620         }
621         catch(exception& e) {
622                 m->errorOut(e, "PhyloTree", "get");
623                 exit(1);
624         }
625 }
626 /**************************************************************************************************/
627 bool PhyloTree::ErrorCheck(vector<string> templateFileNames){
628         try {
629         
630                 bool okay = true;
631                 templateFileNames.push_back("unknown");
632                 
633                 map<string, int>::iterator itFind;
634                 map<string, int> taxonomyFileNames = name2Taxonomy;
635                 
636         if (m->debug) { m->mothurOut("[DEBUG]: in error check. Numseqs in template = " + toString(templateFileNames.size()) + ". Numseqs in taxonomy = " + toString(taxonomyFileNames.size()) + ".\n"); }
637         
638                 for (int i = 0; i < templateFileNames.size(); i++) {
639                         itFind = taxonomyFileNames.find(templateFileNames[i]);
640                         
641                         if (itFind != taxonomyFileNames.end()) { //found it so erase it
642                                 taxonomyFileNames.erase(itFind);
643                         }else {
644                                 m->mothurOut("'" +templateFileNames[i] + "' is in your template file and is not in your taxonomy file. Please correct."); m->mothurOutEndLine();
645                                 okay = false;
646                         }
647                         
648                         //templateFileNames.erase(templateFileNames.begin()+i);
649                         //i--;
650                 }
651                 templateFileNames.clear();
652                 
653                 if (taxonomyFileNames.size() > 0) { //there are names in tax file that are not in template
654                         okay = false;
655                         
656                         for (itFind = taxonomyFileNames.begin(); itFind != taxonomyFileNames.end(); itFind++) {
657                                 m->mothurOut(itFind->first + " is in your taxonomy file and is not in your template file. Please correct."); m->mothurOutEndLine();
658                         }
659                 }
660                 
661                 return okay;
662         }
663         catch(exception& e) {
664                 m->errorOut(e, "PhyloTree", "ErrorCheck");
665                 exit(1);
666         }
667 }
668 /**************************************************************************************************/
669         
670
671
672