]> git.donarmstrong.com Git - mothur.git/blob - phylotree.cpp
e430fb9e85622dcd1d425534857fe37dff600da9
[mothur.git] / phylotree.cpp
1 /*
2  *  doTaxonomy.cpp
3  *  
4  *
5  *  Created by Pat Schloss on 6/17/09.
6  *  Copyright 2009 Patrick D. Schloss. All rights reserved.
7  *
8  */
9
10 #include "phylotree.h"
11
12 /**************************************************************************************************/
13
14 PhyloTree::PhyloTree(){
15         try {
16                 m = MothurOut::getInstance();
17                 numNodes = 1;
18                 numSeqs = 0;
19                 tree.push_back(TaxNode("Root"));
20                 tree[0].heirarchyID = "0";
21                 maxLevel = 0;
22                 calcTotals = true;
23         }
24         catch(exception& e) {
25                 m->errorOut(e, "PhyloTree", "PhyloTree");
26                 exit(1);
27         }
28 }
29 /**************************************************************************************************/
30
31 PhyloTree::PhyloTree(ifstream& in, string filename){
32         try {
33                 m = MothurOut::getInstance();
34                 calcTotals = false;
35                 numNodes = 0;
36                 numSeqs = 0;
37                 
38                 #ifdef USE_MPI
39                         MPI_File inMPI;
40                         MPI_Offset size;
41                         MPI_Status status;
42
43                         char inFileName[1024];
44                         strcpy(inFileName, filename.c_str());
45
46                         MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI);  
47                         MPI_File_get_size(inMPI, &size);
48                         
49                         char* buffer = new char[size];
50                         MPI_File_read(inMPI, buffer, size, MPI_CHAR, &status);
51
52                         string tempBuf = buffer;
53                         if (tempBuf.length() > size) { tempBuf = tempBuf.substr(0, size);  }
54                         istringstream iss (tempBuf,istringstream::in);
55                         delete buffer;
56                         
57                         //read version
58                         m->getline(iss); m->gobble(iss);
59                         
60                         iss >> numNodes; m->gobble(iss);
61                         
62                         tree.resize(numNodes);
63                         
64                         for (int i = 0; i < tree.size(); i++) {
65                                 iss >> tree[i].name >> tree[i].level >> tree[i].parent; m->gobble(iss);
66                         }
67                         
68                         //read genus nodes
69                         int numGenus = 0;
70                         iss >> numGenus; m->gobble(iss);
71                         
72                         int gnode, gsize;
73                         totals.clear();
74                         for (int i = 0; i < numGenus; i++) {
75                                 iss >> gnode >> gsize; m->gobble(iss);
76                                 
77                                 uniqueTaxonomies[gnode] = gnode;
78                                 totals.push_back(gsize);
79                         }
80                         
81                         MPI_File_close(&inMPI);
82                         
83                 #else
84                         //read version
85                         string line = m->getline(in); m->gobble(in);
86                         
87                         in >> numNodes; m->gobble(in);
88                         
89                         tree.resize(numNodes);
90                         
91                         for (int i = 0; i < tree.size(); i++) {
92                                 in >> tree[i].name >> tree[i].level >> tree[i].parent; m->gobble(in);
93                         }
94                         
95                         //read genus nodes
96                         int numGenus = 0;
97                         in >> numGenus; m->gobble(in);
98                         
99                         int gnode, gsize;
100                         totals.clear();
101                         for (int i = 0; i < numGenus; i++) {
102                                 in >> gnode >> gsize; m->gobble(in);
103                                 
104                                 uniqueTaxonomies[gnode] = gnode;
105                                 totals.push_back(gsize);
106                         }
107                         
108                         in.close();
109                         
110                 #endif
111                 
112         }
113         catch(exception& e) {
114                 m->errorOut(e, "PhyloTree", "PhyloTree");
115                 exit(1);
116         }
117 }
118 /**************************************************************************************************/
119
120 PhyloTree::PhyloTree(string tfile){
121         try {
122                 m = MothurOut::getInstance();
123                 numNodes = 1;
124                 numSeqs = 0;
125                 tree.push_back(TaxNode("Root"));
126                 tree[0].heirarchyID = "0";
127                 maxLevel = 0;
128                 calcTotals = true;
129                 string name, tax;
130
131                 
132                 #ifdef USE_MPI
133                         int pid, num, processors;
134                         vector<unsigned long long> positions;
135                         
136                         MPI_Status status; 
137                         MPI_File inMPI;
138                         MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
139                         MPI_Comm_size(MPI_COMM_WORLD, &processors);
140
141                         char inFileName[1024];
142                         strcpy(inFileName, tfile.c_str());
143
144                         MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI);  //comm, filename, mode, info, filepointer
145
146                         if (pid == 0) {
147                                 positions = m->setFilePosEachLine(tfile, num);
148                                 
149                                 //send file positions to all processes
150                                 for(int i = 1; i < processors; i++) { 
151                                         MPI_Send(&num, 1, MPI_INT, i, 2001, MPI_COMM_WORLD);
152                                         MPI_Send(&positions[0], (num+1), MPI_LONG, i, 2001, MPI_COMM_WORLD);
153                                 }
154                         }else{
155                                 MPI_Recv(&num, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status);
156                                 positions.resize(num+1);
157                                 MPI_Recv(&positions[0], (num+1), MPI_LONG, 0, 2001, MPI_COMM_WORLD, &status);
158                         }
159                 
160                         //read file 
161                         for(int i=0;i<num;i++){
162                                 //read next sequence
163                                 int length = positions[i+1] - positions[i];
164                                 char* buf4 = new char[length];
165
166                                 MPI_File_read_at(inMPI, positions[i], buf4, length, MPI_CHAR, &status);
167
168                                 string tempBuf = buf4;
169                                 if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length); }
170                                 delete buf4;
171
172                                 istringstream iss (tempBuf,istringstream::in);
173                                 iss >> name >> tax;
174                                 addSeqToTree(name, tax);
175                         }
176                         
177                         MPI_File_close(&inMPI);
178                         MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case
179                 
180                 #else
181                         ifstream in;
182                         m->openInputFile(tfile, in);
183                         
184                         //read in users taxonomy file and add sequences to tree
185                         while(!in.eof()){
186                                 in >> name >> tax; m->gobble(in);
187                         
188                                 addSeqToTree(name, tax);
189                         }
190                         in.close();
191                 #endif
192         
193                 assignHeirarchyIDs(0);
194         
195                 //create file for summary if needed
196                 setUp(tfile);
197         }
198         catch(exception& e) {
199                 m->errorOut(e, "PhyloTree", "PhyloTree");
200                 exit(1);
201         }
202 }
203
204 /**************************************************************************************************/
205
206 string PhyloTree::getNextTaxon(string& heirarchy, string seqname){
207         try {
208                 string currentLevel = "";
209                 if(heirarchy != ""){
210                         int pos = heirarchy.find_first_of(';');
211                         
212                         if (pos == -1) { //you can't find another ;
213                                 currentLevel = heirarchy;
214                                 heirarchy = "";
215                                 m->mothurOut(seqname + " is missing a ;, please check for other errors."); m->mothurOutEndLine();
216                         }else{
217                                 currentLevel=heirarchy.substr(0,pos);
218                                 if (pos != (heirarchy.length()-1)) {  heirarchy=heirarchy.substr(pos+1);  }
219                                 else { heirarchy = ""; }
220                         }
221                         
222                 }
223                 return currentLevel;
224         }
225         catch(exception& e) {
226                 m->errorOut(e, "PhyloTree", "getNextTaxon");
227                 exit(1);
228         }
229 }
230
231 /**************************************************************************************************/
232
233 int PhyloTree::addSeqToTree(string seqName, string seqTaxonomy){
234         try {
235                         
236                 numSeqs++;
237                 
238                 map<string, int>::iterator childPointer;
239                 
240                 int currentNode = 0;
241                 int level = 1;
242                 
243                 tree[0].accessions.push_back(seqName);
244                 m->removeConfidences(seqTaxonomy);
245                 
246                 string taxon;// = getNextTaxon(seqTaxonomy);
247         
248                 while(seqTaxonomy != ""){
249                         
250                         level++;
251                 
252                         if (m->control_pressed) { return 0; }
253                         
254                         //somehow the parent is getting one too many accnos
255                         //use print to reassign the taxa id
256                         taxon = getNextTaxon(seqTaxonomy, seqName);
257                         
258                         if (taxon == "") {  m->mothurOut(seqName + " has an error in the taxonomy.  This may be due to a ;;"); m->mothurOutEndLine(); if (currentNode != 0) {  uniqueTaxonomies[currentNode] = currentNode; } break;  }
259                         
260                         childPointer = tree[currentNode].children.find(taxon);
261                         
262                         if(childPointer != tree[currentNode].children.end()){   //if the node already exists, move on
263                                 currentNode = childPointer->second;
264                                 tree[currentNode].accessions.push_back(seqName);
265                                 name2Taxonomy[seqName] = currentNode;
266                         }
267                         else{                                                                                   //otherwise, create it
268                                 tree.push_back(TaxNode(taxon));
269                                 numNodes++;
270                                 tree[currentNode].children[taxon] = numNodes-1;
271                                 tree[numNodes-1].parent = currentNode;
272                                 
273                                 currentNode = tree[currentNode].children[taxon];
274                                 tree[currentNode].accessions.push_back(seqName);
275                                 name2Taxonomy[seqName] = currentNode;
276                         }
277         
278                         if (seqTaxonomy == "") {   uniqueTaxonomies[currentNode] = currentNode; }
279                 }
280                 
281                 return 0;
282         }
283         catch(exception& e) {
284                 m->errorOut(e, "PhyloTree", "addSeqToTree");
285                 exit(1);
286         }
287 }
288 /**************************************************************************************************/
289 vector<int> PhyloTree::getGenusNodes()  {
290         try {
291                 genusIndex.clear();
292                 //generate genusIndexes
293                 map<int, int>::iterator it2;
294                 for (it2=uniqueTaxonomies.begin(); it2!=uniqueTaxonomies.end(); it2++) {  genusIndex.push_back(it2->first);     }
295                 
296                 return genusIndex;
297         }
298         catch(exception& e) {
299                 m->errorOut(e, "PhyloTree", "getGenusNodes");
300                 exit(1);
301         }
302 }
303 /**************************************************************************************************/
304 vector<int> PhyloTree::getGenusTotals() {
305         try {
306         
307                 if (calcTotals) {
308                         totals.clear();
309                         //reset counts because we are on a new word
310                         for (int j = 0; j < genusIndex.size(); j++) {
311                                 totals.push_back(tree[genusIndex[j]].accessions.size());
312                         }
313                         return totals;
314                 }else{
315                         return totals;
316                 }
317                 
318         }
319         catch(exception& e) {
320                 m->errorOut(e, "PhyloTree", "getGenusNodes");
321                 exit(1);
322         }
323 }
324 /**************************************************************************************************/
325
326 void PhyloTree::assignHeirarchyIDs(int index){
327         try {
328                 map<string,int>::iterator it;
329                 int counter = 1;
330                 
331                 for(it=tree[index].children.begin();it!=tree[index].children.end();it++){
332                         tree[it->second].heirarchyID = tree[index].heirarchyID + '.' + toString(counter);
333                         counter++;
334                         tree[it->second].level = tree[index].level + 1;
335                                                 
336                         //save maxLevel for binning the unclassified seqs
337                         if (tree[it->second].level > maxLevel) { maxLevel = tree[it->second].level; } 
338                         
339                         assignHeirarchyIDs(it->second);
340                 }
341         }
342         catch(exception& e) {
343                 m->errorOut(e, "PhyloTree", "assignHeirarchyIDs");
344                 exit(1);
345         }
346 }
347 /**************************************************************************************************/
348 void PhyloTree::setUp(string tfile){
349         try{
350                 string taxFileNameTest = tfile.substr(0,tfile.find_last_of(".")+1) + "tree.sum";
351                 
352                 #ifdef USE_MPI
353                         int pid;
354                         MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
355
356                         if (pid == 0) {  binUnclassified(taxFileNameTest);  }
357                 
358                 #else
359                         binUnclassified(taxFileNameTest); 
360                 #endif
361         }
362         catch(exception& e) {
363                 m->errorOut(e, "PhyloTree", "setUp");
364                 exit(1);
365         }
366 }
367 /**************************************************************************************************/
368 void PhyloTree::binUnclassified(string file){
369         try {
370         
371                 ofstream out;
372                 m->openOutputFile(file, out);
373                 
374                 map<string, int>::iterator itBin;
375                 map<string, int>::iterator childPointer;
376                 
377                 vector<TaxNode> copy = tree;
378                         
379                 //fill out tree
380                 fillOutTree(0, copy);
381         
382                 //get leaf nodes that may need extension
383                 for (int i = 0; i < copy.size(); i++) {  
384
385                         if (copy[i].children.size() == 0) {
386                                 leafNodes[i] = i;
387                         }
388                 }
389                 
390                 int copyNodes = copy.size();
391         
392                 //go through the seqs and if a sequence finest taxon is not the same level as the most finely defined taxon then classify it as unclassified where necessary
393                 map<int, int>::iterator itLeaf;
394                 for (itLeaf = leafNodes.begin(); itLeaf != leafNodes.end(); itLeaf++) {
395                         
396                         if (m->control_pressed) {  out.close(); break;  }
397                         
398                         int level = copy[itLeaf->second].level;
399                         int currentNode = itLeaf->second;
400                         
401                         //this sequence is unclassified at some levels
402                         while(level < maxLevel){
403                 
404                                 level++;
405                         
406                                 string taxon = "unclassified";  
407                                 
408                                 //does the parent have a child names 'unclassified'?
409                                 childPointer = copy[currentNode].children.find(taxon);
410                                 
411                                 if(childPointer != copy[currentNode].children.end()){   //if the node already exists, move on
412                                         currentNode = childPointer->second; //currentNode becomes 'unclassified'
413                                 }
414                                 else{                                                                                   //otherwise, create it
415                                         copy.push_back(TaxNode(taxon));
416                                         copyNodes++;
417                                         copy[currentNode].children[taxon] = copyNodes-1;
418                                         copy[copyNodes-1].parent = currentNode;
419                                         copy[copyNodes-1].level = copy[currentNode].level + 1;
420                                                                         
421                                         currentNode = copy[currentNode].children[taxon];
422                                 }
423                         }
424                 }
425                 
426                 if (!m->control_pressed) {
427                         //print copy tree
428                         print(out, copy);
429                 }
430                                 
431         }
432         catch(exception& e) {
433                 m->errorOut(e, "PhyloTree", "binUnclassified");
434                 exit(1);
435         }
436 }
437 /**************************************************************************************************/
438 void PhyloTree::fillOutTree(int index, vector<TaxNode>& copy) {
439         try {
440         
441                 map<string,int>::iterator it;
442                 
443                 it = copy[index].children.find("unclassified");
444                 if (it == copy[index].children.end()) { //no unclassified at this level
445                         string taxon = "unclassified";
446                         copy.push_back(TaxNode(taxon));
447                         copy[index].children[taxon] = copy.size()-1;
448                         copy[copy.size()-1].parent = index;
449                         copy[copy.size()-1].level = copy[index].level + 1;
450                 }
451                 
452                 if (tree[index].level < maxLevel) {
453                         for(it=tree[index].children.begin();it!=tree[index].children.end();it++){ //check your children
454                                 fillOutTree(it->second, copy);
455                         }
456                 }
457
458         }
459         catch(exception& e) {
460                 m->errorOut(e, "PhyloTree", "fillOutTree");
461                 exit(1);
462         }
463 }
464 /**************************************************************************************************/
465 string PhyloTree::getFullTaxonomy(string seqName) {
466         try {
467                 string tax = "";
468                 
469                 int currentNode = name2Taxonomy[seqName];
470                 
471                 while (tree[currentNode].parent != -1) {
472                         tax = tree[currentNode].name + ";" + tax;
473                         currentNode = tree[currentNode].parent;
474                 }
475                 
476                 return tax;
477         }
478         catch(exception& e) {
479                 m->errorOut(e, "PhyloTree", "getFullTaxonomy");
480                 exit(1);
481         }
482 }
483 /**************************************************************************************************/
484
485 void PhyloTree::print(ofstream& out, vector<TaxNode>& copy){
486         try {
487         
488                 //output mothur version
489                 out << "#" << m->getVersion() << endl;
490                 
491                 out << copy.size() << endl;
492                 
493                 out << maxLevel << endl;
494                 
495                 for (int i = 0; i < copy.size(); i++) {
496         
497                         out << copy[i].level << '\t'<< copy[i].name << '\t' << copy[i].children.size() << '\t';
498                         
499                         map<string,int>::iterator it;
500                         for(it=copy[i].children.begin();it!=copy[i].children.end();it++){
501                                 out << it->first << '\t' << it->second << '\t';
502                         }
503                         out << endl;
504                 }
505                 
506                 out.close();
507         }
508         catch(exception& e) {
509                 m->errorOut(e, "PhyloTree", "print");
510                 exit(1);
511         }
512 }
513 /**************************************************************************************************/
514 void PhyloTree::printTreeNodes(string treefilename) {
515         try {
516         
517                 #ifdef USE_MPI
518                         int pid;
519                         MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
520
521                         if (pid == 0) {  
522                 
523                 #endif
524
525                         ofstream outTree;
526                         m->openOutputFile(treefilename, outTree);
527                         
528                         //output mothur version
529                         outTree << "#" << m->getVersion() << endl;
530                         
531                         //print treenodes
532                         outTree << tree.size() << endl;
533                         for (int i = 0; i < tree.size(); i++) {
534                                 outTree << tree[i].name << '\t' << tree[i].level << '\t' << tree[i].parent << endl;
535                         }
536                         
537                         //print genus nodes
538                         outTree << endl << uniqueTaxonomies.size() << endl;
539                         map<int, int>::iterator it2;
540                         for (it2=uniqueTaxonomies.begin(); it2!=uniqueTaxonomies.end(); it2++) {  outTree << it2->first << '\t' << tree[it2->first].accessions.size() << endl;  }
541                         outTree << endl;
542                         
543                         outTree.close();
544                 
545                 #ifdef USE_MPI
546                         }
547                 #endif
548
549                 
550         }
551         catch(exception& e) {
552                 m->errorOut(e, "PhyloTree", "printTreeNodes");
553                 exit(1);
554         }
555 }
556 /**************************************************************************************************/
557 TaxNode PhyloTree::get(int i ){
558         try {
559                 if (i < tree.size()) {  return tree[i];  }
560                 else {  cout << i << '\t' << tree.size() << endl ; m->mothurOut("Mismatch with taxonomy and template files. Cannot continue."); m->mothurOutEndLine(); exit(1); }
561         }
562         catch(exception& e) {
563                 m->errorOut(e, "PhyloTree", "get");
564                 exit(1);
565         }
566 }
567 /**************************************************************************************************/
568 TaxNode PhyloTree::get(string seqName){
569         try {
570                 map<string, int>::iterator itFind = name2Taxonomy.find(seqName);
571         
572                 if (itFind != name2Taxonomy.end()) {  return tree[name2Taxonomy[seqName]];  }
573                 else { m->mothurOut("Cannot find " + seqName + ". Mismatch with taxonomy and template files. Cannot continue."); m->mothurOutEndLine(); exit(1);}
574         }
575         catch(exception& e) {
576                 m->errorOut(e, "PhyloTree", "get");
577                 exit(1);
578         }
579 }
580 /**************************************************************************************************/
581 string PhyloTree::getName(int i ){
582         try {
583                 if (i < tree.size()) {  return tree[i].name;     }
584                 else { m->mothurOut("Mismatch with taxonomy and template files. Cannot continue."); m->mothurOutEndLine(); exit(1); }
585         }
586         catch(exception& e) {
587                 m->errorOut(e, "PhyloTree", "get");
588                 exit(1);
589         }
590 }
591 /**************************************************************************************************/
592 int PhyloTree::getIndex(string seqName){
593         try {
594                 map<string, int>::iterator itFind = name2Taxonomy.find(seqName);
595         
596                 if (itFind != name2Taxonomy.end()) {  return name2Taxonomy[seqName];  }
597                 else { m->mothurOut("Cannot find " + seqName + ". Mismatch with taxonomy and template files. Cannot continue."); m->mothurOutEndLine(); exit(1);}
598         }
599         catch(exception& e) {
600                 m->errorOut(e, "PhyloTree", "get");
601                 exit(1);
602         }
603 }
604 /**************************************************************************************************/
605 bool PhyloTree::ErrorCheck(vector<string> templateFileNames){
606         try {
607         
608                 bool okay = true;
609                 
610                 map<string, int>::iterator itFind;
611                 map<string, int> taxonomyFileNames = name2Taxonomy;
612                 
613                 for (int i = 0; i < templateFileNames.size(); i++) {
614                         itFind = taxonomyFileNames.find(templateFileNames[i]);
615                         
616                         if (itFind != taxonomyFileNames.end()) { //found it so erase it
617                                 taxonomyFileNames.erase(itFind);
618                         }else {
619                                 m->mothurOut(templateFileNames[i] + " is in your template file and is not in your taxonomy file. Please correct."); m->mothurOutEndLine();
620                                 okay = false;
621                         }
622                         
623                         //templateFileNames.erase(templateFileNames.begin()+i);
624                         //i--;
625                 }
626                 templateFileNames.clear();
627                 
628                 if (taxonomyFileNames.size() > 0) { //there are names in tax file that are not in template
629                         okay = false;
630                         
631                         for (itFind = taxonomyFileNames.begin(); itFind != taxonomyFileNames.end(); itFind++) {
632                                 m->mothurOut(itFind->first + " is in your taxonomy file and is not in your template file. Please correct."); m->mothurOutEndLine();
633                         }
634                 }
635                 
636                 return okay;
637         }
638         catch(exception& e) {
639                 m->errorOut(e, "PhyloTree", "ErrorCheck");
640                 exit(1);
641         }
642 }
643 /**************************************************************************************************/
644         
645
646
647