]> git.donarmstrong.com Git - mothur.git/blob - phylotree.cpp
Merge remote-tracking branch 'mothur/master'
[mothur.git] / phylotree.cpp
1 /*
2  *  doTaxonomy.cpp
3  *  
4  *
5  *  Created by Pat Schloss on 6/17/09.
6  *  Copyright 2009 Patrick D. Schloss. All rights reserved.
7  *
8  */
9
10 #include "phylotree.h"
11
12 /**************************************************************************************************/
13
14 PhyloTree::PhyloTree(){
15         try {
16                 m = MothurOut::getInstance();
17                 numNodes = 1;
18                 numSeqs = 0;
19                 tree.push_back(TaxNode("Root"));
20                 tree[0].heirarchyID = "0";
21                 maxLevel = 0;
22                 calcTotals = true;
23                 addSeqToTree("unknown", "unknown;");
24         }
25         catch(exception& e) {
26                 m->errorOut(e, "PhyloTree", "PhyloTree");
27                 exit(1);
28         }
29 }
30 /**************************************************************************************************/
31
32 PhyloTree::PhyloTree(ifstream& in, string filename){
33         try {
34                 m = MothurOut::getInstance();
35                 calcTotals = false;
36                 numNodes = 0;
37                 numSeqs = 0;
38                 
39                 #ifdef USE_MPI
40                         MPI_File inMPI;
41                         MPI_Offset size;
42                         MPI_Status status;
43
44                         char inFileName[1024];
45                         strcpy(inFileName, filename.c_str());
46
47                         MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI);  
48                         MPI_File_get_size(inMPI, &size);
49                         
50                         char* buffer = new char[size];
51                         MPI_File_read(inMPI, buffer, size, MPI_CHAR, &status);
52
53                         string tempBuf = buffer;
54                         if (tempBuf.length() > size) { tempBuf = tempBuf.substr(0, size);  }
55                         istringstream iss (tempBuf,istringstream::in);
56                         delete buffer;
57                         
58                         //read version
59                         m->getline(iss); m->gobble(iss);
60                         
61                         iss >> numNodes; m->gobble(iss);
62                         
63                         tree.resize(numNodes);
64                         
65                         for (int i = 0; i < tree.size(); i++) {
66                                 iss >> tree[i].name >> tree[i].level >> tree[i].parent; m->gobble(iss);
67                         }
68                         
69                         //read genus nodes
70                         int numGenus = 0;
71                         iss >> numGenus; m->gobble(iss);
72                         
73                         int gnode, gsize;
74                         totals.clear();
75                         for (int i = 0; i < numGenus; i++) {
76                                 iss >> gnode >> gsize; m->gobble(iss);
77                                 
78                                 uniqueTaxonomies[gnode] = gnode;
79                                 totals.push_back(gsize);
80                         }
81                         
82                         MPI_File_close(&inMPI);
83                         
84                 #else
85                         //read version
86                         string line = m->getline(in); m->gobble(in);
87                         
88                         in >> numNodes; m->gobble(in);
89                         
90                         tree.resize(numNodes);
91                         
92                         for (int i = 0; i < tree.size(); i++) {
93                                 in >> tree[i].name >> tree[i].level >> tree[i].parent; m->gobble(in);
94                         }
95                         
96                         //read genus nodes
97                         int numGenus = 0;
98                         in >> numGenus; m->gobble(in);
99                         
100                         int gnode, gsize;
101                         totals.clear();
102                         for (int i = 0; i < numGenus; i++) {
103                                 in >> gnode >> gsize; m->gobble(in);
104                                 
105                                 uniqueTaxonomies[gnode] = gnode;
106                                 totals.push_back(gsize);
107                         }
108                         
109                         in.close();
110                         
111                 #endif
112                 
113         }
114         catch(exception& e) {
115                 m->errorOut(e, "PhyloTree", "PhyloTree");
116                 exit(1);
117         }
118 }
119 /**************************************************************************************************/
120
121 PhyloTree::PhyloTree(string tfile){
122         try {
123                 m = MothurOut::getInstance();
124                 numNodes = 1;
125                 numSeqs = 0;
126                 tree.push_back(TaxNode("Root"));
127                 tree[0].heirarchyID = "0";
128                 maxLevel = 0;
129                 calcTotals = true;
130                 string name, tax;
131                 
132                 #ifdef USE_MPI
133                         int pid, num, processors;
134                         vector<unsigned long long> positions;
135                         
136                         MPI_Status status; 
137                         MPI_File inMPI;
138                         MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
139                         MPI_Comm_size(MPI_COMM_WORLD, &processors);
140
141                         char inFileName[1024];
142                         strcpy(inFileName, tfile.c_str());
143
144                         MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI);  //comm, filename, mode, info, filepointer
145
146                         if (pid == 0) {
147                                 positions = m->setFilePosEachLine(tfile, num);
148                                 
149                                 //send file positions to all processes
150                                 for(int i = 1; i < processors; i++) { 
151                                         MPI_Send(&num, 1, MPI_INT, i, 2001, MPI_COMM_WORLD);
152                                         MPI_Send(&positions[0], (num+1), MPI_LONG, i, 2001, MPI_COMM_WORLD);
153                                 }
154                         }else{
155                                 MPI_Recv(&num, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status);
156                                 positions.resize(num+1);
157                                 MPI_Recv(&positions[0], (num+1), MPI_LONG, 0, 2001, MPI_COMM_WORLD, &status);
158                         }
159                 
160                         //read file 
161                         for(int i=0;i<num;i++){
162                                 //read next sequence
163                                 int length = positions[i+1] - positions[i];
164                                 char* buf4 = new char[length];
165
166                                 MPI_File_read_at(inMPI, positions[i], buf4, length, MPI_CHAR, &status);
167
168                                 string tempBuf = buf4;
169                                 if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length); }
170                                 delete buf4;
171
172                                 istringstream iss (tempBuf,istringstream::in);
173                                 iss >> name >> tax;
174                                 addSeqToTree(name, tax);
175                         }
176                         
177                         MPI_File_close(&inMPI);
178                         MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case
179                 
180                 #else
181                         ifstream in;
182                         m->openInputFile(tfile, in);
183                         
184                         //read in users taxonomy file and add sequences to tree
185                         while(!in.eof()){
186                                 in >> name >> tax; m->gobble(in);
187                         
188                                 addSeqToTree(name, tax);
189                         }
190                         in.close();
191                 #endif
192         
193                 assignHeirarchyIDs(0);
194         
195         
196         string unknownTax = "unknown;";
197         //added last taxon until you get desired level
198                 for (int i = 1; i < maxLevel; i++) {
199                         unknownTax += "unclassfied;";
200                 }
201         
202         addSeqToTree("unknown", unknownTax);
203         
204                 //create file for summary if needed
205                 setUp(tfile);
206         }
207         catch(exception& e) {
208                 m->errorOut(e, "PhyloTree", "PhyloTree");
209                 exit(1);
210         }
211 }
212
213 /**************************************************************************************************/
214
215 string PhyloTree::getNextTaxon(string& heirarchy, string seqname){
216         try {
217                 string currentLevel = "";
218                 if(heirarchy != ""){
219                         int pos = heirarchy.find_first_of(';');
220                         
221                         if (pos == -1) { //you can't find another ;
222                                 currentLevel = heirarchy;
223                                 heirarchy = "";
224                                 m->mothurOut(seqname + " is missing a ;, please check for other errors."); m->mothurOutEndLine();
225                         }else{
226                                 currentLevel=heirarchy.substr(0,pos);
227                                 if (pos != (heirarchy.length()-1)) {  heirarchy=heirarchy.substr(pos+1);  }
228                                 else { heirarchy = ""; }
229                         }
230                         
231                 }
232                 return currentLevel;
233         }
234         catch(exception& e) {
235                 m->errorOut(e, "PhyloTree", "getNextTaxon");
236                 exit(1);
237         }
238 }
239
240 /**************************************************************************************************/
241
242 int PhyloTree::addSeqToTree(string seqName, string seqTaxonomy){
243         try {
244                 numSeqs++;
245                 
246                 map<string, int>::iterator childPointer;
247                 
248                 int currentNode = 0;
249                 int level = 1;
250                 
251                 tree[0].accessions.push_back(seqName);
252                 m->removeConfidences(seqTaxonomy);
253                 
254                 string taxon;// = getNextTaxon(seqTaxonomy);
255         
256                 while(seqTaxonomy != ""){
257                         
258                         level++;
259                 
260                         if (m->control_pressed) { return 0; }
261                         
262                         //somehow the parent is getting one too many accnos
263                         //use print to reassign the taxa id
264                         taxon = getNextTaxon(seqTaxonomy, seqName);
265                         
266                         if (taxon == "") {  m->mothurOut(seqName + " has an error in the taxonomy.  This may be due to a ;;"); m->mothurOutEndLine(); if (currentNode != 0) {  uniqueTaxonomies[currentNode] = currentNode; } break;  }
267                         
268                         childPointer = tree[currentNode].children.find(taxon);
269                         
270                         if(childPointer != tree[currentNode].children.end()){   //if the node already exists, move on
271                                 currentNode = childPointer->second;
272                                 tree[currentNode].accessions.push_back(seqName);
273                                 name2Taxonomy[seqName] = currentNode;
274                         }
275                         else{                                                                                   //otherwise, create it
276                                 tree.push_back(TaxNode(taxon));
277                                 numNodes++;
278                                 tree[currentNode].children[taxon] = numNodes-1;
279                                 tree[numNodes-1].parent = currentNode;
280                                 
281                                 currentNode = tree[currentNode].children[taxon];
282                                 tree[currentNode].accessions.push_back(seqName);
283                                 name2Taxonomy[seqName] = currentNode;
284                         }
285         
286                         if (seqTaxonomy == "") {   uniqueTaxonomies[currentNode] = currentNode; }
287                 }
288                 
289                 return 0;
290         }
291         catch(exception& e) {
292                 m->errorOut(e, "PhyloTree", "addSeqToTree");
293                 exit(1);
294         }
295 }
296 /**************************************************************************************************/
297 vector<int> PhyloTree::getGenusNodes()  {
298         try {
299                 genusIndex.clear();
300                 //generate genusIndexes
301                 map<int, int>::iterator it2;
302                 for (it2=uniqueTaxonomies.begin(); it2!=uniqueTaxonomies.end(); it2++) {  genusIndex.push_back(it2->first);     }
303                 
304                 return genusIndex;
305         }
306         catch(exception& e) {
307                 m->errorOut(e, "PhyloTree", "getGenusNodes");
308                 exit(1);
309         }
310 }
311 /**************************************************************************************************/
312 vector<int> PhyloTree::getGenusTotals() {
313         try {
314         
315                 if (calcTotals) {
316                         totals.clear();
317                         //reset counts because we are on a new word
318                         for (int j = 0; j < genusIndex.size(); j++) {
319                                 totals.push_back(tree[genusIndex[j]].accessions.size());
320                         }
321                         return totals;
322                 }else{
323                         return totals;
324                 }
325                 
326         }
327         catch(exception& e) {
328                 m->errorOut(e, "PhyloTree", "getGenusNodes");
329                 exit(1);
330         }
331 }
332 /**************************************************************************************************/
333
334 void PhyloTree::assignHeirarchyIDs(int index){
335         try {
336                 map<string,int>::iterator it;
337                 int counter = 1;
338                 
339                 for(it=tree[index].children.begin();it!=tree[index].children.end();it++){
340                         tree[it->second].heirarchyID = tree[index].heirarchyID + '.' + toString(counter);
341                         counter++;
342                         tree[it->second].level = tree[index].level + 1;
343                                                 
344                         //save maxLevel for binning the unclassified seqs
345                         if (tree[it->second].level > maxLevel) { maxLevel = tree[it->second].level; } 
346                         
347                         assignHeirarchyIDs(it->second);
348                 }
349         }
350         catch(exception& e) {
351                 m->errorOut(e, "PhyloTree", "assignHeirarchyIDs");
352                 exit(1);
353         }
354 }
355 /**************************************************************************************************/
356 void PhyloTree::setUp(string tfile){
357         try{
358                 string taxFileNameTest = tfile.substr(0,tfile.find_last_of(".")+1) + "tree.sum";
359                 
360                 #ifdef USE_MPI
361                         int pid;
362                         MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
363
364                         if (pid == 0) {  binUnclassified(taxFileNameTest);  }
365                 
366                 #else
367                         binUnclassified(taxFileNameTest); 
368                 #endif
369         }
370         catch(exception& e) {
371                 m->errorOut(e, "PhyloTree", "setUp");
372                 exit(1);
373         }
374 }
375 /**************************************************************************************************/
376 void PhyloTree::binUnclassified(string file){
377         try {
378         
379                 ofstream out;
380                 m->openOutputFile(file, out);
381                 
382                 map<string, int>::iterator itBin;
383                 map<string, int>::iterator childPointer;
384                 
385                 vector<TaxNode> copy = tree;
386                 
387                 //fill out tree
388                 fillOutTree(0, copy);
389         
390                 //get leaf nodes that may need extension
391                 for (int i = 0; i < copy.size(); i++) {  
392
393                         if (copy[i].children.size() == 0) {
394                                 leafNodes[i] = i;
395                         }
396                 }
397                 
398                 int copyNodes = copy.size();
399         
400                 //go through the seqs and if a sequence finest taxon is not the same level as the most finely defined taxon then classify it as unclassified where necessary
401                 map<int, int>::iterator itLeaf;
402                 for (itLeaf = leafNodes.begin(); itLeaf != leafNodes.end(); itLeaf++) {
403                         
404                         if (m->control_pressed) {  out.close(); break;  }
405                         
406                         int level = copy[itLeaf->second].level;
407                         int currentNode = itLeaf->second;
408                         
409                         //this sequence is unclassified at some levels
410                         while(level < maxLevel){
411                 
412                                 level++;
413                         
414                                 string taxon = "unclassified";  
415                                 
416                                 //does the parent have a child names 'unclassified'?
417                                 childPointer = copy[currentNode].children.find(taxon);
418                                 
419                                 if(childPointer != copy[currentNode].children.end()){   //if the node already exists, move on
420                                         currentNode = childPointer->second; //currentNode becomes 'unclassified'
421                                 }
422                                 else{                                                                                   //otherwise, create it
423                                         copy.push_back(TaxNode(taxon));
424                                         copyNodes++;
425                                         copy[currentNode].children[taxon] = copyNodes-1;
426                                         copy[copyNodes-1].parent = currentNode;
427                                         copy[copyNodes-1].level = copy[currentNode].level + 1;
428                                                                         
429                                         currentNode = copy[currentNode].children[taxon];
430                                 }
431                         }
432                 }
433                 
434                 if (!m->control_pressed) {
435                         //print copy tree
436                         print(out, copy);
437                 }
438                                 
439         }
440         catch(exception& e) {
441                 m->errorOut(e, "PhyloTree", "binUnclassified");
442                 exit(1);
443         }
444 }
445 /**************************************************************************************************/
446 void PhyloTree::fillOutTree(int index, vector<TaxNode>& copy) {
447         try {
448         
449                 map<string,int>::iterator it;
450                 
451                 it = copy[index].children.find("unclassified");
452                 if (it == copy[index].children.end()) { //no unclassified at this level
453                         string taxon = "unclassified";
454                         copy.push_back(TaxNode(taxon));
455                         copy[index].children[taxon] = copy.size()-1;
456                         copy[copy.size()-1].parent = index;
457                         copy[copy.size()-1].level = copy[index].level + 1;
458                 }
459                 
460                 if (tree[index].level < maxLevel) {
461                         for(it=tree[index].children.begin();it!=tree[index].children.end();it++){ //check your children
462                                 fillOutTree(it->second, copy);
463                         }
464                 }
465
466         }
467         catch(exception& e) {
468                 m->errorOut(e, "PhyloTree", "fillOutTree");
469                 exit(1);
470         }
471 }
472 /**************************************************************************************************/
473 string PhyloTree::getFullTaxonomy(string seqName) {
474         try {
475                 string tax = "";
476                 
477                 int currentNode = name2Taxonomy[seqName];
478                 
479                 while (tree[currentNode].parent != -1) {
480                         tax = tree[currentNode].name + ";" + tax;
481                         currentNode = tree[currentNode].parent;
482                 }
483                 
484                 return tax;
485         }
486         catch(exception& e) {
487                 m->errorOut(e, "PhyloTree", "getFullTaxonomy");
488                 exit(1);
489         }
490 }
491 /**************************************************************************************************/
492
493 void PhyloTree::print(ofstream& out, vector<TaxNode>& copy){
494         try {
495                 
496                 //output mothur version
497                 out << "#" << m->getVersion() << endl;
498                 
499                 out << copy.size() << endl;
500                 
501                 out << maxLevel << endl;
502                                 
503                 for (int i = 0; i < copy.size(); i++) {
504                                 
505                         out << copy[i].level << '\t'<< copy[i].name << '\t' << copy[i].children.size() << '\t';
506                         
507                         map<string,int>::iterator it;
508                         for(it=copy[i].children.begin();it!=copy[i].children.end();it++){
509                                 out << it->first << '\t' << it->second << '\t';
510                         }
511                         out << endl;
512                 }
513                 
514                 out.close();
515         }
516         catch(exception& e) {
517                 m->errorOut(e, "PhyloTree", "print");
518                 exit(1);
519         }
520 }
521 /**************************************************************************************************/
522 void PhyloTree::printTreeNodes(string treefilename) {
523         try {
524         
525                 #ifdef USE_MPI
526                         int pid;
527                         MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
528
529                         if (pid == 0) {  
530                 
531                 #endif
532
533                         ofstream outTree;
534                         m->openOutputFile(treefilename, outTree);
535                         
536                         //output mothur version
537                         outTree << "#" << m->getVersion() << endl;
538                         
539                         //print treenodes
540                         outTree << tree.size() << endl;
541                         for (int i = 0; i < tree.size(); i++) {
542                                 outTree << tree[i].name << '\t' << tree[i].level << '\t' << tree[i].parent << endl;
543                         }
544                         
545                         //print genus nodes
546                         outTree << endl << uniqueTaxonomies.size() << endl;
547                         map<int, int>::iterator it2;
548                         for (it2=uniqueTaxonomies.begin(); it2!=uniqueTaxonomies.end(); it2++) {  outTree << it2->first << '\t' << tree[it2->first].accessions.size() << endl;  }
549                         outTree << endl;
550                         
551                         outTree.close();
552                 
553                 #ifdef USE_MPI
554                         }
555                 #endif
556
557                 
558         }
559         catch(exception& e) {
560                 m->errorOut(e, "PhyloTree", "printTreeNodes");
561                 exit(1);
562         }
563 }
564 /**************************************************************************************************/
565 TaxNode PhyloTree::get(int i ){
566         try {
567                 if (i < tree.size()) {  return tree[i];  }
568                 else {  cout << i << '\t' << tree.size() << endl ; m->mothurOut("Mismatch with taxonomy and template files. Cannot continue."); m->mothurOutEndLine(); exit(1); }
569         }
570         catch(exception& e) {
571                 m->errorOut(e, "PhyloTree", "get");
572                 exit(1);
573         }
574 }
575 /**************************************************************************************************/
576 TaxNode PhyloTree::get(string seqName){
577         try {
578                 map<string, int>::iterator itFind = name2Taxonomy.find(seqName);
579         
580                 if (itFind != name2Taxonomy.end()) {  return tree[name2Taxonomy[seqName]];  }
581                 else { m->mothurOut("Cannot find " + seqName + ". Mismatch with taxonomy and template files. Cannot continue."); m->mothurOutEndLine(); exit(1);}
582         }
583         catch(exception& e) {
584                 m->errorOut(e, "PhyloTree", "get");
585                 exit(1);
586         }
587 }
588 /**************************************************************************************************/
589 string PhyloTree::getName(int i ){
590         try {
591                 if (i < tree.size()) {  return tree[i].name;     }
592                 else { m->mothurOut("Mismatch with taxonomy and template files. Cannot continue."); m->mothurOutEndLine(); exit(1); }
593         }
594         catch(exception& e) {
595                 m->errorOut(e, "PhyloTree", "get");
596                 exit(1);
597         }
598 }
599 /**************************************************************************************************/
600 int PhyloTree::getIndex(string seqName){
601         try {
602                 map<string, int>::iterator itFind = name2Taxonomy.find(seqName);
603         
604                 if (itFind != name2Taxonomy.end()) {  return name2Taxonomy[seqName];  }
605                 else { m->mothurOut("Cannot find " + seqName + ". Mismatch with taxonomy and template files. Cannot continue."); m->mothurOutEndLine(); exit(1);}
606         }
607         catch(exception& e) {
608                 m->errorOut(e, "PhyloTree", "get");
609                 exit(1);
610         }
611 }
612 /**************************************************************************************************/
613 bool PhyloTree::ErrorCheck(vector<string> templateFileNames){
614         try {
615         
616                 bool okay = true;
617                 templateFileNames.push_back("unknown");
618                 
619                 map<string, int>::iterator itFind;
620                 map<string, int> taxonomyFileNames = name2Taxonomy;
621                 
622         if (m->debug) { m->mothurOut("[DEBUG]: in error check. Numseqs in template = " + toString(templateFileNames.size()) + ". Numseqs in taxonomy = " + toString(taxonomyFileNames.size()) + ".\n"); }
623         
624                 for (int i = 0; i < templateFileNames.size(); i++) {
625                         itFind = taxonomyFileNames.find(templateFileNames[i]);
626                         
627                         if (itFind != taxonomyFileNames.end()) { //found it so erase it
628                                 taxonomyFileNames.erase(itFind);
629                         }else {
630                                 m->mothurOut("'" +templateFileNames[i] + "' is in your template file and is not in your taxonomy file. Please correct."); m->mothurOutEndLine();
631                                 okay = false;
632                         }
633                         
634                         //templateFileNames.erase(templateFileNames.begin()+i);
635                         //i--;
636                 }
637                 templateFileNames.clear();
638                 
639                 if (taxonomyFileNames.size() > 0) { //there are names in tax file that are not in template
640                         okay = false;
641                         
642                         for (itFind = taxonomyFileNames.begin(); itFind != taxonomyFileNames.end(); itFind++) {
643                                 m->mothurOut(itFind->first + " is in your taxonomy file and is not in your template file. Please correct."); m->mothurOutEndLine();
644                         }
645                 }
646                 
647                 return okay;
648         }
649         catch(exception& e) {
650                 m->errorOut(e, "PhyloTree", "ErrorCheck");
651                 exit(1);
652         }
653 }
654 /**************************************************************************************************/
655         
656
657
658