]> git.donarmstrong.com Git - mothur.git/blob - phylotree.cpp
changes for 1.12.2
[mothur.git] / phylotree.cpp
1 /*
2  *  doTaxonomy.cpp
3  *  
4  *
5  *  Created by Pat Schloss on 6/17/09.
6  *  Copyright 2009 Patrick D. Schloss. All rights reserved.
7  *
8  */
9
10 #include "phylotree.h"
11
12 /**************************************************************************************************/
13
14 PhyloTree::PhyloTree(){
15         try {
16                 m = MothurOut::getInstance();
17                 numNodes = 1;
18                 numSeqs = 0;
19                 tree.push_back(TaxNode("Root"));
20                 tree[0].heirarchyID = "0";
21                 maxLevel = 0;
22                 calcTotals = true;
23         }
24         catch(exception& e) {
25                 m->errorOut(e, "PhyloTree", "PhyloTree");
26                 exit(1);
27         }
28 }
29 /**************************************************************************************************/
30
31 PhyloTree::PhyloTree(ifstream& in, string filename){
32         try {
33                 m = MothurOut::getInstance();
34                 calcTotals = false;
35                 numNodes = 0;
36                 numSeqs = 0;
37                 
38                 #ifdef USE_MPI
39                         MPI_File inMPI;
40                         MPI_Offset size;
41                         MPI_Status status;
42
43                         char inFileName[1024];
44                         strcpy(inFileName, filename.c_str());
45
46                         MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI);  
47                         MPI_File_get_size(inMPI, &size);
48                         
49                         char* buffer = new char[size];
50                         MPI_File_read(inMPI, buffer, size, MPI_CHAR, &status);
51
52                         string tempBuf = buffer;
53                         if (tempBuf.length() > size) { tempBuf = tempBuf.substr(0, size);  }
54                         istringstream iss (tempBuf,istringstream::in);
55                         delete buffer;
56                         
57                         iss >> numNodes; gobble(iss);
58                         
59                         tree.resize(numNodes);
60                         
61                         for (int i = 0; i < tree.size(); i++) {
62                                 iss >> tree[i].name >> tree[i].level >> tree[i].parent; gobble(iss);
63                         }
64                         
65                         //read genus nodes
66                         int numGenus = 0;
67                         iss >> numGenus; gobble(iss);
68                         
69                         int gnode, gsize;
70                         totals.clear();
71                         for (int i = 0; i < numGenus; i++) {
72                                 iss >> gnode >> gsize; gobble(iss);
73                                 
74                                 uniqueTaxonomies[gnode] = gnode;
75                                 totals.push_back(gsize);
76                         }
77                         
78                         MPI_File_close(&inMPI);
79                         
80                 #else
81                         in >> numNodes; gobble(in);
82                         
83                         tree.resize(numNodes);
84                         
85                         for (int i = 0; i < tree.size(); i++) {
86                                 in >> tree[i].name >> tree[i].level >> tree[i].parent; gobble(in);
87                         }
88                         
89                         //read genus nodes
90                         int numGenus = 0;
91                         in >> numGenus; gobble(in);
92                         
93                         int gnode, gsize;
94                         totals.clear();
95                         for (int i = 0; i < numGenus; i++) {
96                                 in >> gnode >> gsize; gobble(in);
97                                 
98                                 uniqueTaxonomies[gnode] = gnode;
99                                 totals.push_back(gsize);
100                         }
101                         
102                         in.close();
103                         
104                 #endif
105                 
106         }
107         catch(exception& e) {
108                 m->errorOut(e, "PhyloTree", "PhyloTree");
109                 exit(1);
110         }
111 }
112 /**************************************************************************************************/
113
114 PhyloTree::PhyloTree(string tfile){
115         try {
116                 m = MothurOut::getInstance();
117                 numNodes = 1;
118                 numSeqs = 0;
119                 tree.push_back(TaxNode("Root"));
120                 tree[0].heirarchyID = "0";
121                 maxLevel = 0;
122                 calcTotals = true;
123                 string name, tax;
124
125                 
126                 #ifdef USE_MPI
127                         int pid, num, processors;
128                         vector<unsigned long int> positions;
129                         
130                         MPI_Status status; 
131                         MPI_File inMPI;
132                         MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
133                         MPI_Comm_size(MPI_COMM_WORLD, &processors);
134
135                         char inFileName[1024];
136                         strcpy(inFileName, tfile.c_str());
137
138                         MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI);  //comm, filename, mode, info, filepointer
139
140                         if (pid == 0) {
141                                 positions = setFilePosEachLine(tfile, num);
142                                 
143                                 //send file positions to all processes
144                                 for(int i = 1; i < processors; i++) { 
145                                         MPI_Send(&num, 1, MPI_INT, i, 2001, MPI_COMM_WORLD);
146                                         MPI_Send(&positions[0], (num+1), MPI_LONG, i, 2001, MPI_COMM_WORLD);
147                                 }
148                         }else{
149                                 MPI_Recv(&num, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status);
150                                 positions.resize(num+1);
151                                 MPI_Recv(&positions[0], (num+1), MPI_LONG, 0, 2001, MPI_COMM_WORLD, &status);
152                         }
153                 
154                         //read file 
155                         for(int i=0;i<num;i++){
156                                 //read next sequence
157                                 int length = positions[i+1] - positions[i];
158                                 char* buf4 = new char[length];
159
160                                 MPI_File_read_at(inMPI, positions[i], buf4, length, MPI_CHAR, &status);
161
162                                 string tempBuf = buf4;
163                                 if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length); }
164                                 delete buf4;
165
166                                 istringstream iss (tempBuf,istringstream::in);
167                                 iss >> name >> tax;
168                                 addSeqToTree(name, tax);
169                         }
170                         
171                         MPI_File_close(&inMPI);
172                         MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case
173                 
174                 #else
175                         ifstream in;
176                         openInputFile(tfile, in);
177                         
178                         //read in users taxonomy file and add sequences to tree
179                         while(!in.eof()){
180                                 in >> name >> tax; gobble(in);
181                                 
182                                 addSeqToTree(name, tax);
183                         }
184                         in.close();
185                 #endif
186                 
187                 assignHeirarchyIDs(0);
188                 
189                 //create file for summary if needed
190                 setUp(tfile);
191         }
192         catch(exception& e) {
193                 m->errorOut(e, "PhyloTree", "PhyloTree");
194                 exit(1);
195         }
196 }
197
198 /**************************************************************************************************/
199
200 string PhyloTree::getNextTaxon(string& heirarchy, string seqname){
201         try {
202                 string currentLevel = "";
203                 if(heirarchy != ""){
204                         int pos = heirarchy.find_first_of(';');
205                         
206                         if (pos == -1) { //you can't find another ;
207                                 currentLevel = heirarchy;
208                                 heirarchy = "";
209                                 m->mothurOut(seqname + " is missing a ;, please check for other errors."); m->mothurOutEndLine();
210                         }else{
211                                 currentLevel=heirarchy.substr(0,pos);
212                                 if (pos != (heirarchy.length()-1)) {  heirarchy=heirarchy.substr(pos+1);  }
213                                 else { heirarchy = ""; }
214                         }
215                         
216                 }
217                 return currentLevel;
218         }
219         catch(exception& e) {
220                 m->errorOut(e, "PhyloTree", "getNextTaxon");
221                 exit(1);
222         }
223 }
224
225 /**************************************************************************************************/
226
227 int PhyloTree::addSeqToTree(string seqName, string seqTaxonomy){
228         try {
229                         
230                 numSeqs++;
231                 
232                 map<string, int>::iterator childPointer;
233                 
234                 int currentNode = 0;
235                 int level = 1;
236                 
237                 tree[0].accessions.push_back(seqName);
238                 string taxon;// = getNextTaxon(seqTaxonomy);
239         
240                 while(seqTaxonomy != ""){
241                         
242                         level++;
243                 
244                         if (m->control_pressed) { return 0; }
245                         
246                         //somehow the parent is getting one too many accnos
247                         //use print to reassign the taxa id
248                         taxon = getNextTaxon(seqTaxonomy, seqName);
249                         
250                         if (taxon == "") {  m->mothurOut(seqName + " has an error in the taxonomy.  This may be due to a ;;"); m->mothurOutEndLine(); if (currentNode != 0) {  uniqueTaxonomies[currentNode] = currentNode; } break;  }
251                         
252                         childPointer = tree[currentNode].children.find(taxon);
253                         
254                         if(childPointer != tree[currentNode].children.end()){   //if the node already exists, move on
255                                 currentNode = childPointer->second;
256                                 tree[currentNode].accessions.push_back(seqName);
257                                 name2Taxonomy[seqName] = currentNode;
258                         }
259                         else{                                                                                   //otherwise, create it
260                                 tree.push_back(TaxNode(taxon));
261                                 numNodes++;
262                                 tree[currentNode].children[taxon] = numNodes-1;
263                                 tree[numNodes-1].parent = currentNode;
264                                 
265                                 currentNode = tree[currentNode].children[taxon];
266                                 tree[currentNode].accessions.push_back(seqName);
267                                 name2Taxonomy[seqName] = currentNode;
268                         }
269         
270                         if (seqTaxonomy == "") {   uniqueTaxonomies[currentNode] = currentNode; }
271                 }
272         }
273         catch(exception& e) {
274                 m->errorOut(e, "PhyloTree", "addSeqToTree");
275                 exit(1);
276         }
277 }
278 /**************************************************************************************************/
279 vector<int> PhyloTree::getGenusNodes()  {
280         try {
281                 genusIndex.clear();
282                 //generate genusIndexes
283                 map<int, int>::iterator it2;
284                 for (it2=uniqueTaxonomies.begin(); it2!=uniqueTaxonomies.end(); it2++) {  genusIndex.push_back(it2->first);     }
285                 
286                 return genusIndex;
287         }
288         catch(exception& e) {
289                 m->errorOut(e, "PhyloTree", "getGenusNodes");
290                 exit(1);
291         }
292 }
293 /**************************************************************************************************/
294 vector<int> PhyloTree::getGenusTotals() {
295         try {
296         
297                 if (calcTotals) {
298                         totals.clear();
299                         //reset counts because we are on a new word
300                         for (int j = 0; j < genusIndex.size(); j++) {
301                                 totals.push_back(tree[genusIndex[j]].accessions.size());
302                         }
303                         return totals;
304                 }else{
305                         return totals;
306                 }
307                 
308         }
309         catch(exception& e) {
310                 m->errorOut(e, "PhyloTree", "getGenusNodes");
311                 exit(1);
312         }
313 }
314 /**************************************************************************************************/
315
316 void PhyloTree::assignHeirarchyIDs(int index){
317         try {
318                 map<string,int>::iterator it;
319                 int counter = 1;
320                 
321                 for(it=tree[index].children.begin();it!=tree[index].children.end();it++){
322                         tree[it->second].heirarchyID = tree[index].heirarchyID + '.' + toString(counter);
323                         counter++;
324                         tree[it->second].level = tree[index].level + 1;
325                                                 
326                         //save maxLevel for binning the unclassified seqs
327                         if (tree[it->second].level > maxLevel) { maxLevel = tree[it->second].level; } 
328                         
329                         assignHeirarchyIDs(it->second);
330                 }
331         }
332         catch(exception& e) {
333                 m->errorOut(e, "PhyloTree", "assignHeirarchyIDs");
334                 exit(1);
335         }
336 }
337 /**************************************************************************************************/
338 void PhyloTree::setUp(string tfile){
339         try{
340                 string taxFileNameTest = tfile.substr(0,tfile.find_last_of(".")+1) + "tree.sum";
341                 
342                 #ifdef USE_MPI
343                         int pid;
344                         MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
345
346                         if (pid == 0) {  binUnclassified(taxFileNameTest);  }
347                 
348                 #else
349                         //create file needed for summary if it doesn't exist
350                         ifstream FileTest(taxFileNameTest.c_str());
351                         
352                         if (!FileTest) { 
353                                 binUnclassified(taxFileNameTest); 
354                         }
355                 #endif
356         }
357         catch(exception& e) {
358                 m->errorOut(e, "PhyloTree", "setUp");
359                 exit(1);
360         }
361 }
362 /**************************************************************************************************/
363 void PhyloTree::binUnclassified(string file){
364         try {
365         
366                 ofstream out;
367                 openOutputFile(file, out);
368                 
369                 map<string, int>::iterator itBin;
370                 map<string, int>::iterator childPointer;
371                 
372                 vector<TaxNode> copy = tree;
373                         
374                 //fill out tree
375                 fillOutTree(0, copy);
376         
377                 //get leaf nodes that may need extension
378                 for (int i = 0; i < copy.size(); i++) {  
379
380                         if (copy[i].children.size() == 0) {
381                                 leafNodes[i] = i;
382                         }
383                 }
384         //cout << "maxLevel = " << maxLevel << endl;    
385                 int copyNodes = copy.size();
386                 
387                 //go through the seqs and if a sequence finest taxon is not the same level as the most finely defined taxon then classify it as unclassified where necessary
388                 map<int, int>::iterator itLeaf;
389                 for (itLeaf = leafNodes.begin(); itLeaf != leafNodes.end(); itLeaf++) {
390                         
391                         if (m->control_pressed) {  out.close(); break;  }
392                         
393                         int level = copy[itLeaf->second].level;
394                         int currentNode = itLeaf->second;
395                         
396                         //this sequence is unclassified at some levels
397                         while(level < maxLevel){
398                 
399                                 level++;
400                         
401                                 string taxon = "unclassified";  
402                                 
403                                 //does the parent have a child names 'unclassified'?
404                                 childPointer = copy[currentNode].children.find(taxon);
405                                 
406                                 if(childPointer != copy[currentNode].children.end()){   //if the node already exists, move on
407                                         currentNode = childPointer->second; //currentNode becomes 'unclassified'
408                                 }
409                                 else{                                                                                   //otherwise, create it
410                                         copy.push_back(TaxNode(taxon));
411                                         copyNodes++;
412                                         copy[currentNode].children[taxon] = copyNodes-1;
413                                         copy[copyNodes-1].parent = currentNode;
414                                         copy[copyNodes-1].level = copy[currentNode].level + 1;
415                                                                         
416                                         currentNode = copy[currentNode].children[taxon];
417                                 }
418                         }
419                 }
420                 
421                 if (!m->control_pressed) {
422                         //print copy tree
423                         print(out, copy);
424                 }
425                                 
426         }
427         catch(exception& e) {
428                 m->errorOut(e, "PhyloTree", "binUnclassified");
429                 exit(1);
430         }
431 }
432 /**************************************************************************************************/
433 void PhyloTree::fillOutTree(int index, vector<TaxNode>& copy) {
434         try {
435                 map<string,int>::iterator it;
436                 
437                 if (copy[index].level < maxLevel) {
438                         it = copy[index].children.find("unclassified");
439                         if (it == copy[index].children.end()) { //no unclassified at this level
440                                 string taxon = "unclassified";
441                                 copy.push_back(TaxNode(taxon));
442                                 copy[index].children[taxon] = copy.size()-1;
443                                 copy[copy.size()-1].parent = index;
444                                 copy[copy.size()-1].level = copy[index].level + 1;
445                         }
446                 
447                         
448                         for(it=copy[index].children.begin();it!=copy[index].children.end();it++){ //check your children
449                                 fillOutTree(it->second, copy);
450                         }
451                         
452                 }
453                 
454         }
455         catch(exception& e) {
456                 m->errorOut(e, "PhyloTree", "fillOutTree");
457                 exit(1);
458         }
459 }
460 /**************************************************************************************************/
461 string PhyloTree::getFullTaxonomy(string seqName) {
462         try {
463                 string tax = "";
464                 
465                 int currentNode = name2Taxonomy[seqName];
466                 
467                 while (tree[currentNode].parent != -1) {
468                         tax = tree[currentNode].name + ";" + tax;
469                         currentNode = tree[currentNode].parent;
470                 }
471                 
472                 return tax;
473         }
474         catch(exception& e) {
475                 m->errorOut(e, "PhyloTree", "getFullTaxonomy");
476                 exit(1);
477         }
478 }
479 /**************************************************************************************************/
480
481 void PhyloTree::print(ofstream& out, vector<TaxNode>& copy){
482         try {
483                 out << copy.size() << endl;
484                 
485                 for (int i = 0; i < copy.size(); i++) {
486         
487                         out << copy[i].level << '\t'<< copy[i].name << '\t' << copy[i].children.size() << '\t';
488                         
489                         map<string,int>::iterator it;
490                         for(it=copy[i].children.begin();it!=copy[i].children.end();it++){
491                                 out << it->first << '\t' << it->second << '\t';
492                         }
493                         out << endl;
494                 }
495                 
496                 out.close();
497         }
498         catch(exception& e) {
499                 m->errorOut(e, "PhyloTree", "print");
500                 exit(1);
501         }
502 }
503 /**************************************************************************************************/
504 void PhyloTree::printTreeNodes(string treefilename) {
505         try {
506         
507                 #ifdef USE_MPI
508                         int pid;
509                         MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
510
511                         if (pid == 0) {  
512                 
513                 #endif
514
515                         ofstream outTree;
516                         openOutputFile(treefilename, outTree);
517                         
518                         //print treenodes
519                         outTree << tree.size() << endl;
520                         for (int i = 0; i < tree.size(); i++) {
521                                 outTree << tree[i].name << '\t' << tree[i].level << '\t' << tree[i].parent << endl;
522                         }
523                         
524                         //print genus nodes
525                         outTree << endl << uniqueTaxonomies.size() << endl;
526                         map<int, int>::iterator it2;
527                         for (it2=uniqueTaxonomies.begin(); it2!=uniqueTaxonomies.end(); it2++) {  outTree << it2->first << '\t' << tree[it2->first].accessions.size() << endl;  }
528                         outTree << endl;
529                         
530                         outTree.close();
531                 
532                 #ifdef USE_MPI
533                         }
534                 #endif
535
536                 
537         }
538         catch(exception& e) {
539                 m->errorOut(e, "PhyloTree", "printTreeNodes");
540                 exit(1);
541         }
542 }
543 /**************************************************************************************************/
544 TaxNode PhyloTree::get(int i ){
545         try {
546                 if (i < tree.size()) {  return tree[i];  }
547                 else {  cout << i << '\t' << tree.size() << endl ; m->mothurOut("Mismatch with taxonomy and template files. Cannot continue."); m->mothurOutEndLine(); exit(1); }
548         }
549         catch(exception& e) {
550                 m->errorOut(e, "PhyloTree", "get");
551                 exit(1);
552         }
553 }
554 /**************************************************************************************************/
555 TaxNode PhyloTree::get(string seqName){
556         try {
557                 map<string, int>::iterator itFind = name2Taxonomy.find(seqName);
558         
559                 if (itFind != name2Taxonomy.end()) {  return tree[name2Taxonomy[seqName]];  }
560                 else { m->mothurOut("Cannot find " + seqName + ". Mismatch with taxonomy and template files. Cannot continue."); m->mothurOutEndLine(); exit(1);}
561         }
562         catch(exception& e) {
563                 m->errorOut(e, "PhyloTree", "get");
564                 exit(1);
565         }
566 }
567 /**************************************************************************************************/
568 string PhyloTree::getName(int i ){
569         try {
570                 if (i < tree.size()) {  return tree[i].name;     }
571                 else { m->mothurOut("Mismatch with taxonomy and template files. Cannot continue."); m->mothurOutEndLine(); exit(1); }
572         }
573         catch(exception& e) {
574                 m->errorOut(e, "PhyloTree", "get");
575                 exit(1);
576         }
577 }
578 /**************************************************************************************************/
579 int PhyloTree::getIndex(string seqName){
580         try {
581                 map<string, int>::iterator itFind = name2Taxonomy.find(seqName);
582         
583                 if (itFind != name2Taxonomy.end()) {  return name2Taxonomy[seqName];  }
584                 else { m->mothurOut("Cannot find " + seqName + ". Mismatch with taxonomy and template files. Cannot continue."); m->mothurOutEndLine(); exit(1);}
585         }
586         catch(exception& e) {
587                 m->errorOut(e, "PhyloTree", "get");
588                 exit(1);
589         }
590 }
591 /**************************************************************************************************/
592 bool PhyloTree::ErrorCheck(vector<string> templateFileNames){
593         try {
594         
595                 bool okay = true;
596                 
597                 map<string, int>::iterator itFind;
598                 map<string, int> taxonomyFileNames = name2Taxonomy;
599                 
600                 for (int i = 0; i < templateFileNames.size(); i++) {
601                         itFind = taxonomyFileNames.find(templateFileNames[i]);
602                         
603                         if (itFind != taxonomyFileNames.end()) { //found it so erase it
604                                 taxonomyFileNames.erase(itFind);
605                         }else {
606                                 m->mothurOut(templateFileNames[i] + " is in your template file and is not in your taxonomy file. Please correct."); m->mothurOutEndLine();
607                                 okay = false;
608                         }
609                         
610                         templateFileNames.erase(templateFileNames.begin()+i);
611                         i--;
612                 }
613                 
614                 if (taxonomyFileNames.size() > 0) { //there are names in tax file that are not in template
615                         okay = false;
616                         
617                         for (itFind = taxonomyFileNames.begin(); itFind != taxonomyFileNames.end(); itFind++) {
618                                 m->mothurOut(itFind->first + " is in your taxonomy file and is not in your template file. Please correct."); m->mothurOutEndLine();
619                         }
620                 }
621                 
622                 return okay;
623         }
624         catch(exception& e) {
625                 m->errorOut(e, "PhyloTree", "ErrorCheck");
626                 exit(1);
627         }
628 }
629 /**************************************************************************************************/
630         
631
632
633