]> git.donarmstrong.com Git - mothur.git/blob - phylotree.cpp
fixed bug with shhh.flow from file path name in write functions, added "smart" featur...
[mothur.git] / phylotree.cpp
1 /*
2  *  doTaxonomy.cpp
3  *  
4  *
5  *  Created by Pat Schloss on 6/17/09.
6  *  Copyright 2009 Patrick D. Schloss. All rights reserved.
7  *
8  */
9
10 #include "phylotree.h"
11
12 /**************************************************************************************************/
13
14 PhyloTree::PhyloTree(){
15         try {
16                 m = MothurOut::getInstance();
17                 numNodes = 1;
18                 numSeqs = 0;
19                 tree.push_back(TaxNode("Root"));
20                 tree[0].heirarchyID = "0";
21                 maxLevel = 0;
22                 calcTotals = true;
23                 addSeqToTree("unknown", "unknown;");
24         }
25         catch(exception& e) {
26                 m->errorOut(e, "PhyloTree", "PhyloTree");
27                 exit(1);
28         }
29 }
30 /**************************************************************************************************/
31
32 PhyloTree::PhyloTree(ifstream& in, string filename){
33         try {
34                 m = MothurOut::getInstance();
35                 calcTotals = false;
36                 numNodes = 0;
37                 numSeqs = 0;
38                 
39                 #ifdef USE_MPI
40                         MPI_File inMPI;
41                         MPI_Offset size;
42                         MPI_Status status;
43
44                         char inFileName[1024];
45                         strcpy(inFileName, filename.c_str());
46
47                         MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI);  
48                         MPI_File_get_size(inMPI, &size);
49                         
50                         char* buffer = new char[size];
51                         MPI_File_read(inMPI, buffer, size, MPI_CHAR, &status);
52
53                         string tempBuf = buffer;
54                         if (tempBuf.length() > size) { tempBuf = tempBuf.substr(0, size);  }
55                         istringstream iss (tempBuf,istringstream::in);
56                         delete buffer;
57                         
58                         //read version
59                         m->getline(iss); m->gobble(iss);
60                         
61                         iss >> numNodes; m->gobble(iss);
62                         
63                         tree.resize(numNodes);
64                         
65                         for (int i = 0; i < tree.size(); i++) {
66                                 iss >> tree[i].name >> tree[i].level >> tree[i].parent; m->gobble(iss);
67                         }
68                         
69                         //read genus nodes
70                         int numGenus = 0;
71                         iss >> numGenus; m->gobble(iss);
72                         
73                         int gnode, gsize;
74                         totals.clear();
75                         for (int i = 0; i < numGenus; i++) {
76                                 iss >> gnode >> gsize; m->gobble(iss);
77                                 
78                                 uniqueTaxonomies[gnode] = gnode;
79                                 totals.push_back(gsize);
80                         }
81                         
82                         MPI_File_close(&inMPI);
83                         
84                 #else
85                         //read version
86                         string line = m->getline(in); m->gobble(in);
87                         
88                         in >> numNodes; m->gobble(in);
89                         
90                         tree.resize(numNodes);
91                         
92                         for (int i = 0; i < tree.size(); i++) {
93                                 in >> tree[i].name >> tree[i].level >> tree[i].parent; m->gobble(in);
94                         }
95                         
96                         //read genus nodes
97                         int numGenus = 0;
98                         in >> numGenus; m->gobble(in);
99                         
100                         int gnode, gsize;
101                         totals.clear();
102                         for (int i = 0; i < numGenus; i++) {
103                                 in >> gnode >> gsize; m->gobble(in);
104                                 
105                                 uniqueTaxonomies[gnode] = gnode;
106                                 totals.push_back(gsize);
107                         }
108                         
109                         in.close();
110                         
111                 #endif
112                 
113         }
114         catch(exception& e) {
115                 m->errorOut(e, "PhyloTree", "PhyloTree");
116                 exit(1);
117         }
118 }
119 /**************************************************************************************************/
120
121 PhyloTree::PhyloTree(string tfile){
122         try {
123                 m = MothurOut::getInstance();
124                 numNodes = 1;
125                 numSeqs = 0;
126                 tree.push_back(TaxNode("Root"));
127                 tree[0].heirarchyID = "0";
128                 maxLevel = 0;
129                 calcTotals = true;
130                 string name, tax;
131                 addSeqToTree("unknown", "unknown;");
132
133                 
134                 #ifdef USE_MPI
135                         int pid, num, processors;
136                         vector<unsigned long long> positions;
137                         
138                         MPI_Status status; 
139                         MPI_File inMPI;
140                         MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
141                         MPI_Comm_size(MPI_COMM_WORLD, &processors);
142
143                         char inFileName[1024];
144                         strcpy(inFileName, tfile.c_str());
145
146                         MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI);  //comm, filename, mode, info, filepointer
147
148                         if (pid == 0) {
149                                 positions = m->setFilePosEachLine(tfile, num);
150                                 
151                                 //send file positions to all processes
152                                 for(int i = 1; i < processors; i++) { 
153                                         MPI_Send(&num, 1, MPI_INT, i, 2001, MPI_COMM_WORLD);
154                                         MPI_Send(&positions[0], (num+1), MPI_LONG, i, 2001, MPI_COMM_WORLD);
155                                 }
156                         }else{
157                                 MPI_Recv(&num, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status);
158                                 positions.resize(num+1);
159                                 MPI_Recv(&positions[0], (num+1), MPI_LONG, 0, 2001, MPI_COMM_WORLD, &status);
160                         }
161                 
162                         //read file 
163                         for(int i=0;i<num;i++){
164                                 //read next sequence
165                                 int length = positions[i+1] - positions[i];
166                                 char* buf4 = new char[length];
167
168                                 MPI_File_read_at(inMPI, positions[i], buf4, length, MPI_CHAR, &status);
169
170                                 string tempBuf = buf4;
171                                 if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length); }
172                                 delete buf4;
173
174                                 istringstream iss (tempBuf,istringstream::in);
175                                 iss >> name >> tax;
176                                 addSeqToTree(name, tax);
177                         }
178                         
179                         MPI_File_close(&inMPI);
180                         MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case
181                 
182                 #else
183                         ifstream in;
184                         m->openInputFile(tfile, in);
185                         
186                         //read in users taxonomy file and add sequences to tree
187                         while(!in.eof()){
188                                 in >> name >> tax; m->gobble(in);
189                         
190                                 addSeqToTree(name, tax);
191                         }
192                         in.close();
193                 #endif
194         
195                 assignHeirarchyIDs(0);
196         
197                 //create file for summary if needed
198                 setUp(tfile);
199         }
200         catch(exception& e) {
201                 m->errorOut(e, "PhyloTree", "PhyloTree");
202                 exit(1);
203         }
204 }
205
206 /**************************************************************************************************/
207
208 string PhyloTree::getNextTaxon(string& heirarchy, string seqname){
209         try {
210                 string currentLevel = "";
211                 if(heirarchy != ""){
212                         int pos = heirarchy.find_first_of(';');
213                         
214                         if (pos == -1) { //you can't find another ;
215                                 currentLevel = heirarchy;
216                                 heirarchy = "";
217                                 m->mothurOut(seqname + " is missing a ;, please check for other errors."); m->mothurOutEndLine();
218                         }else{
219                                 currentLevel=heirarchy.substr(0,pos);
220                                 if (pos != (heirarchy.length()-1)) {  heirarchy=heirarchy.substr(pos+1);  }
221                                 else { heirarchy = ""; }
222                         }
223                         
224                 }
225                 return currentLevel;
226         }
227         catch(exception& e) {
228                 m->errorOut(e, "PhyloTree", "getNextTaxon");
229                 exit(1);
230         }
231 }
232
233 /**************************************************************************************************/
234
235 int PhyloTree::addSeqToTree(string seqName, string seqTaxonomy){
236         try {
237                 numSeqs++;
238                 
239                 map<string, int>::iterator childPointer;
240                 
241                 int currentNode = 0;
242                 int level = 1;
243                 
244                 tree[0].accessions.push_back(seqName);
245                 m->removeConfidences(seqTaxonomy);
246                 
247                 string taxon;// = getNextTaxon(seqTaxonomy);
248         
249                 while(seqTaxonomy != ""){
250                         
251                         level++;
252                 
253                         if (m->control_pressed) { return 0; }
254                         
255                         //somehow the parent is getting one too many accnos
256                         //use print to reassign the taxa id
257                         taxon = getNextTaxon(seqTaxonomy, seqName);
258                         
259                         if (taxon == "") {  m->mothurOut(seqName + " has an error in the taxonomy.  This may be due to a ;;"); m->mothurOutEndLine(); if (currentNode != 0) {  uniqueTaxonomies[currentNode] = currentNode; } break;  }
260                         
261                         childPointer = tree[currentNode].children.find(taxon);
262                         
263                         if(childPointer != tree[currentNode].children.end()){   //if the node already exists, move on
264                                 currentNode = childPointer->second;
265                                 tree[currentNode].accessions.push_back(seqName);
266                                 name2Taxonomy[seqName] = currentNode;
267                         }
268                         else{                                                                                   //otherwise, create it
269                                 tree.push_back(TaxNode(taxon));
270                                 numNodes++;
271                                 tree[currentNode].children[taxon] = numNodes-1;
272                                 tree[numNodes-1].parent = currentNode;
273                                 
274                                 currentNode = tree[currentNode].children[taxon];
275                                 tree[currentNode].accessions.push_back(seqName);
276                                 name2Taxonomy[seqName] = currentNode;
277                         }
278         
279                         if (seqTaxonomy == "") {   uniqueTaxonomies[currentNode] = currentNode; }
280                 }
281                 
282                 return 0;
283         }
284         catch(exception& e) {
285                 m->errorOut(e, "PhyloTree", "addSeqToTree");
286                 exit(1);
287         }
288 }
289 /**************************************************************************************************/
290 vector<int> PhyloTree::getGenusNodes()  {
291         try {
292                 genusIndex.clear();
293                 //generate genusIndexes
294                 map<int, int>::iterator it2;
295                 for (it2=uniqueTaxonomies.begin(); it2!=uniqueTaxonomies.end(); it2++) {  genusIndex.push_back(it2->first);     }
296                 
297                 return genusIndex;
298         }
299         catch(exception& e) {
300                 m->errorOut(e, "PhyloTree", "getGenusNodes");
301                 exit(1);
302         }
303 }
304 /**************************************************************************************************/
305 vector<int> PhyloTree::getGenusTotals() {
306         try {
307         
308                 if (calcTotals) {
309                         totals.clear();
310                         //reset counts because we are on a new word
311                         for (int j = 0; j < genusIndex.size(); j++) {
312                                 totals.push_back(tree[genusIndex[j]].accessions.size());
313                         }
314                         return totals;
315                 }else{
316                         return totals;
317                 }
318                 
319         }
320         catch(exception& e) {
321                 m->errorOut(e, "PhyloTree", "getGenusNodes");
322                 exit(1);
323         }
324 }
325 /**************************************************************************************************/
326
327 void PhyloTree::assignHeirarchyIDs(int index){
328         try {
329                 map<string,int>::iterator it;
330                 int counter = 1;
331                 
332                 for(it=tree[index].children.begin();it!=tree[index].children.end();it++){
333                         tree[it->second].heirarchyID = tree[index].heirarchyID + '.' + toString(counter);
334                         counter++;
335                         tree[it->second].level = tree[index].level + 1;
336                                                 
337                         //save maxLevel for binning the unclassified seqs
338                         if (tree[it->second].level > maxLevel) { maxLevel = tree[it->second].level; } 
339                         
340                         assignHeirarchyIDs(it->second);
341                 }
342         }
343         catch(exception& e) {
344                 m->errorOut(e, "PhyloTree", "assignHeirarchyIDs");
345                 exit(1);
346         }
347 }
348 /**************************************************************************************************/
349 void PhyloTree::setUp(string tfile){
350         try{
351                 string taxFileNameTest = tfile.substr(0,tfile.find_last_of(".")+1) + "tree.sum";
352                 
353                 #ifdef USE_MPI
354                         int pid;
355                         MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
356
357                         if (pid == 0) {  binUnclassified(taxFileNameTest);  }
358                 
359                 #else
360                         binUnclassified(taxFileNameTest); 
361                 #endif
362         }
363         catch(exception& e) {
364                 m->errorOut(e, "PhyloTree", "setUp");
365                 exit(1);
366         }
367 }
368 /**************************************************************************************************/
369 void PhyloTree::binUnclassified(string file){
370         try {
371         
372                 ofstream out;
373                 m->openOutputFile(file, out);
374                 
375                 map<string, int>::iterator itBin;
376                 map<string, int>::iterator childPointer;
377                 
378                 vector<TaxNode> copy = tree;
379                 
380                 //fill out tree
381                 fillOutTree(0, copy);
382         
383                 //get leaf nodes that may need extension
384                 for (int i = 0; i < copy.size(); i++) {  
385
386                         if (copy[i].children.size() == 0) {
387                                 leafNodes[i] = i;
388                         }
389                 }
390                 
391                 int copyNodes = copy.size();
392         
393                 //go through the seqs and if a sequence finest taxon is not the same level as the most finely defined taxon then classify it as unclassified where necessary
394                 map<int, int>::iterator itLeaf;
395                 for (itLeaf = leafNodes.begin(); itLeaf != leafNodes.end(); itLeaf++) {
396                         
397                         if (m->control_pressed) {  out.close(); break;  }
398                         
399                         int level = copy[itLeaf->second].level;
400                         int currentNode = itLeaf->second;
401                         
402                         //this sequence is unclassified at some levels
403                         while(level < maxLevel){
404                 
405                                 level++;
406                         
407                                 string taxon = "unclassified";  
408                                 
409                                 //does the parent have a child names 'unclassified'?
410                                 childPointer = copy[currentNode].children.find(taxon);
411                                 
412                                 if(childPointer != copy[currentNode].children.end()){   //if the node already exists, move on
413                                         currentNode = childPointer->second; //currentNode becomes 'unclassified'
414                                 }
415                                 else{                                                                                   //otherwise, create it
416                                         copy.push_back(TaxNode(taxon));
417                                         copyNodes++;
418                                         copy[currentNode].children[taxon] = copyNodes-1;
419                                         copy[copyNodes-1].parent = currentNode;
420                                         copy[copyNodes-1].level = copy[currentNode].level + 1;
421                                                                         
422                                         currentNode = copy[currentNode].children[taxon];
423                                 }
424                         }
425                 }
426                 
427                 if (!m->control_pressed) {
428                         //print copy tree
429                         print(out, copy);
430                 }
431                                 
432         }
433         catch(exception& e) {
434                 m->errorOut(e, "PhyloTree", "binUnclassified");
435                 exit(1);
436         }
437 }
438 /**************************************************************************************************/
439 void PhyloTree::fillOutTree(int index, vector<TaxNode>& copy) {
440         try {
441         
442                 map<string,int>::iterator it;
443                 
444                 it = copy[index].children.find("unclassified");
445                 if (it == copy[index].children.end()) { //no unclassified at this level
446                         string taxon = "unclassified";
447                         copy.push_back(TaxNode(taxon));
448                         copy[index].children[taxon] = copy.size()-1;
449                         copy[copy.size()-1].parent = index;
450                         copy[copy.size()-1].level = copy[index].level + 1;
451                 }
452                 
453                 if (tree[index].level < maxLevel) {
454                         for(it=tree[index].children.begin();it!=tree[index].children.end();it++){ //check your children
455                                 fillOutTree(it->second, copy);
456                         }
457                 }
458
459         }
460         catch(exception& e) {
461                 m->errorOut(e, "PhyloTree", "fillOutTree");
462                 exit(1);
463         }
464 }
465 /**************************************************************************************************/
466 string PhyloTree::getFullTaxonomy(string seqName) {
467         try {
468                 string tax = "";
469                 
470                 int currentNode = name2Taxonomy[seqName];
471                 
472                 while (tree[currentNode].parent != -1) {
473                         tax = tree[currentNode].name + ";" + tax;
474                         currentNode = tree[currentNode].parent;
475                 }
476                 
477                 return tax;
478         }
479         catch(exception& e) {
480                 m->errorOut(e, "PhyloTree", "getFullTaxonomy");
481                 exit(1);
482         }
483 }
484 /**************************************************************************************************/
485
486 void PhyloTree::print(ofstream& out, vector<TaxNode>& copy){
487         try {
488                 
489                 //output mothur version
490                 out << "#" << m->getVersion() << endl;
491                 
492                 out << copy.size() << endl;
493                 
494                 out << maxLevel << endl;
495                                 
496                 for (int i = 0; i < copy.size(); i++) {
497                                 
498                         out << copy[i].level << '\t'<< copy[i].name << '\t' << copy[i].children.size() << '\t';
499                         
500                         map<string,int>::iterator it;
501                         for(it=copy[i].children.begin();it!=copy[i].children.end();it++){
502                                 out << it->first << '\t' << it->second << '\t';
503                         }
504                         out << endl;
505                 }
506                 
507                 out.close();
508         }
509         catch(exception& e) {
510                 m->errorOut(e, "PhyloTree", "print");
511                 exit(1);
512         }
513 }
514 /**************************************************************************************************/
515 void PhyloTree::printTreeNodes(string treefilename) {
516         try {
517         
518                 #ifdef USE_MPI
519                         int pid;
520                         MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
521
522                         if (pid == 0) {  
523                 
524                 #endif
525
526                         ofstream outTree;
527                         m->openOutputFile(treefilename, outTree);
528                         
529                         //output mothur version
530                         outTree << "#" << m->getVersion() << endl;
531                         
532                         //print treenodes
533                         outTree << tree.size() << endl;
534                         for (int i = 0; i < tree.size(); i++) {
535                                 outTree << tree[i].name << '\t' << tree[i].level << '\t' << tree[i].parent << endl;
536                         }
537                         
538                         //print genus nodes
539                         outTree << endl << uniqueTaxonomies.size() << endl;
540                         map<int, int>::iterator it2;
541                         for (it2=uniqueTaxonomies.begin(); it2!=uniqueTaxonomies.end(); it2++) {  outTree << it2->first << '\t' << tree[it2->first].accessions.size() << endl;  }
542                         outTree << endl;
543                         
544                         outTree.close();
545                 
546                 #ifdef USE_MPI
547                         }
548                 #endif
549
550                 
551         }
552         catch(exception& e) {
553                 m->errorOut(e, "PhyloTree", "printTreeNodes");
554                 exit(1);
555         }
556 }
557 /**************************************************************************************************/
558 TaxNode PhyloTree::get(int i ){
559         try {
560                 if (i < tree.size()) {  return tree[i];  }
561                 else {  cout << i << '\t' << tree.size() << endl ; m->mothurOut("Mismatch with taxonomy and template files. Cannot continue."); m->mothurOutEndLine(); exit(1); }
562         }
563         catch(exception& e) {
564                 m->errorOut(e, "PhyloTree", "get");
565                 exit(1);
566         }
567 }
568 /**************************************************************************************************/
569 TaxNode PhyloTree::get(string seqName){
570         try {
571                 map<string, int>::iterator itFind = name2Taxonomy.find(seqName);
572         
573                 if (itFind != name2Taxonomy.end()) {  return tree[name2Taxonomy[seqName]];  }
574                 else { m->mothurOut("Cannot find " + seqName + ". Mismatch with taxonomy and template files. Cannot continue."); m->mothurOutEndLine(); exit(1);}
575         }
576         catch(exception& e) {
577                 m->errorOut(e, "PhyloTree", "get");
578                 exit(1);
579         }
580 }
581 /**************************************************************************************************/
582 string PhyloTree::getName(int i ){
583         try {
584                 if (i < tree.size()) {  return tree[i].name;     }
585                 else { m->mothurOut("Mismatch with taxonomy and template files. Cannot continue."); m->mothurOutEndLine(); exit(1); }
586         }
587         catch(exception& e) {
588                 m->errorOut(e, "PhyloTree", "get");
589                 exit(1);
590         }
591 }
592 /**************************************************************************************************/
593 int PhyloTree::getIndex(string seqName){
594         try {
595                 map<string, int>::iterator itFind = name2Taxonomy.find(seqName);
596         
597                 if (itFind != name2Taxonomy.end()) {  return name2Taxonomy[seqName];  }
598                 else { m->mothurOut("Cannot find " + seqName + ". Mismatch with taxonomy and template files. Cannot continue."); m->mothurOutEndLine(); exit(1);}
599         }
600         catch(exception& e) {
601                 m->errorOut(e, "PhyloTree", "get");
602                 exit(1);
603         }
604 }
605 /**************************************************************************************************/
606 bool PhyloTree::ErrorCheck(vector<string> templateFileNames){
607         try {
608         
609                 bool okay = true;
610                 templateFileNames.push_back("unknown");
611                 
612                 map<string, int>::iterator itFind;
613                 map<string, int> taxonomyFileNames = name2Taxonomy;
614                 
615                 for (int i = 0; i < templateFileNames.size(); i++) {
616                         itFind = taxonomyFileNames.find(templateFileNames[i]);
617                         
618                         if (itFind != taxonomyFileNames.end()) { //found it so erase it
619                                 taxonomyFileNames.erase(itFind);
620                         }else {
621                                 m->mothurOut(templateFileNames[i] + " is in your template file and is not in your taxonomy file. Please correct."); m->mothurOutEndLine();
622                                 okay = false;
623                         }
624                         
625                         //templateFileNames.erase(templateFileNames.begin()+i);
626                         //i--;
627                 }
628                 templateFileNames.clear();
629                 
630                 if (taxonomyFileNames.size() > 0) { //there are names in tax file that are not in template
631                         okay = false;
632                         
633                         for (itFind = taxonomyFileNames.begin(); itFind != taxonomyFileNames.end(); itFind++) {
634                                 m->mothurOut(itFind->first + " is in your taxonomy file and is not in your template file. Please correct."); m->mothurOutEndLine();
635                         }
636                 }
637                 
638                 return okay;
639         }
640         catch(exception& e) {
641                 m->errorOut(e, "PhyloTree", "ErrorCheck");
642                 exit(1);
643         }
644 }
645 /**************************************************************************************************/
646         
647
648
649