]> git.donarmstrong.com Git - mothur.git/blob - phylotree.cpp
filled out reference taxonomy with "unclassified" so that if a cutoff is used the...
[mothur.git] / phylotree.cpp
1 /*
2  *  doTaxonomy.cpp
3  *  
4  *
5  *  Created by Pat Schloss on 6/17/09.
6  *  Copyright 2009 Patrick D. Schloss. All rights reserved.
7  *
8  */
9
10 #include "phylotree.h"
11
12 /**************************************************************************************************/
13
14 PhyloTree::PhyloTree(){
15         try {
16                 m = MothurOut::getInstance();
17                 numNodes = 1;
18                 numSeqs = 0;
19                 tree.push_back(TaxNode("Root"));
20                 tree[0].heirarchyID = "0";
21                 maxLevel = 0;
22                 calcTotals = true;
23         }
24         catch(exception& e) {
25                 m->errorOut(e, "PhyloTree", "PhyloTree");
26                 exit(1);
27         }
28 }
29 /**************************************************************************************************/
30
31 PhyloTree::PhyloTree(ifstream& in, string filename){
32         try {
33                 m = MothurOut::getInstance();
34                 calcTotals = false;
35                 
36                 #ifdef USE_MPI
37                         MPI_File inMPI;
38                         MPI_Offset size;
39                         MPI_Status status;
40
41                         char inFileName[1024];
42                         strcpy(inFileName, filename.c_str());
43
44                         MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI);  
45                         MPI_File_get_size(inMPI, &size);
46                         
47                         char* buffer = new char[size];
48                         MPI_File_read(inMPI, buffer, size, MPI_CHAR, &status);
49
50                         string tempBuf = buffer;
51                         if (tempBuf.length() > size) { tempBuf = tempBuf.substr(0, size);  }
52                         istringstream iss (tempBuf,istringstream::in);
53                         delete buffer;
54                         
55                         iss >> numNodes; gobble(iss);
56                         
57                         tree.resize(numNodes);
58                         
59                         for (int i = 0; i < tree.size(); i++) {
60                                 iss >> tree[i].name >> tree[i].level >> tree[i].parent; gobble(iss);
61                         }
62                         
63                         //read genus nodes
64                         int numGenus = 0;
65                         iss >> numGenus; gobble(iss);
66                         
67                         int gnode, gsize;
68                         totals.clear();
69                         for (int i = 0; i < numGenus; i++) {
70                                 iss >> gnode >> gsize; gobble(iss);
71                                 
72                                 uniqueTaxonomies[gnode] = gnode;
73                                 totals.push_back(gsize);
74                         }
75                         
76                         MPI_File_close(&inMPI);
77                         
78                 #else
79                         in >> numNodes; gobble(in);
80                         
81                         tree.resize(numNodes);
82                         
83                         for (int i = 0; i < tree.size(); i++) {
84                                 in >> tree[i].name >> tree[i].level >> tree[i].parent; gobble(in);
85                         }
86                         
87                         //read genus nodes
88                         int numGenus = 0;
89                         in >> numGenus; gobble(in);
90                         
91                         int gnode, gsize;
92                         totals.clear();
93                         for (int i = 0; i < numGenus; i++) {
94                                 in >> gnode >> gsize; gobble(in);
95                                 
96                                 uniqueTaxonomies[gnode] = gnode;
97                                 totals.push_back(gsize);
98                         }
99                         
100                         in.close();
101                         
102                 #endif
103                 
104         }
105         catch(exception& e) {
106                 m->errorOut(e, "PhyloTree", "PhyloTree");
107                 exit(1);
108         }
109 }
110 /**************************************************************************************************/
111
112 PhyloTree::PhyloTree(string tfile){
113         try {
114                 m = MothurOut::getInstance();
115                 numNodes = 1;
116                 numSeqs = 0;
117                 tree.push_back(TaxNode("Root"));
118                 tree[0].heirarchyID = "0";
119                 maxLevel = 0;
120                 calcTotals = true;
121                 string name, tax;
122
123                 
124                 #ifdef USE_MPI
125                         int pid, num;
126                         vector<long> positions;
127                         
128                         MPI_Status status; 
129                         MPI_File inMPI;
130                         MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
131
132                         char inFileName[1024];
133                         strcpy(inFileName, tfile.c_str());
134
135                         MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI);  //comm, filename, mode, info, filepointer
136
137                         if (pid == 0) {
138                                 positions = setFilePosEachLine(tfile, num);
139                                 
140                                 //send file positions to all processes
141                                 MPI_Bcast(&num, 1, MPI_INT, 0, MPI_COMM_WORLD);  //send numSeqs
142                                 MPI_Bcast(&positions[0], (num+1), MPI_LONG, 0, MPI_COMM_WORLD); //send file pos 
143                         }else{
144                                 MPI_Bcast(&num, 1, MPI_INT, 0, MPI_COMM_WORLD); //get numSeqs
145                                 positions.resize(num);
146                                 MPI_Bcast(&positions[0], (num+1), MPI_LONG, 0, MPI_COMM_WORLD); //get file positions
147                         }
148                 
149                         //read file 
150                         for(int i=0;i<num;i++){
151                                 //read next sequence
152                                 int length = positions[i+1] - positions[i];
153                                 char* buf4 = new char[length];
154
155                                 MPI_File_read_at(inMPI, positions[i], buf4, length, MPI_CHAR, &status);
156
157                                 string tempBuf = buf4;
158                                 if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length); }
159                                 delete buf4;
160
161                                 istringstream iss (tempBuf,istringstream::in);
162                                 iss >> name >> tax;
163                                 addSeqToTree(name, tax);
164                         }
165                         
166                         MPI_File_close(&inMPI);
167                 
168                 #else
169                         ifstream in;
170                         openInputFile(tfile, in);
171                         
172                         //read in users taxonomy file and add sequences to tree
173                         while(!in.eof()){
174                                 in >> name >> tax; gobble(in);
175                                 
176                                 addSeqToTree(name, tax);
177                         }
178                         in.close();
179                 #endif
180                 
181                 assignHeirarchyIDs(0);
182                 
183                 //create file for summary if needed
184                 setUp(tfile);
185         }
186         catch(exception& e) {
187                 m->errorOut(e, "PhyloTree", "PhyloTree");
188                 exit(1);
189         }
190 }
191
192 /**************************************************************************************************/
193
194 string PhyloTree::getNextTaxon(string& heirarchy){
195         try {
196                 string currentLevel = "";
197                 if(heirarchy != ""){
198                         int pos = heirarchy.find_first_of(';');
199                         currentLevel=heirarchy.substr(0,pos);
200                         if (pos != (heirarchy.length()-1)) {  heirarchy=heirarchy.substr(pos+1);  }
201                         else { heirarchy = ""; }
202                 }
203                 return currentLevel;
204         }
205         catch(exception& e) {
206                 m->errorOut(e, "PhyloTree", "getNextTaxon");
207                 exit(1);
208         }
209 }
210
211 /**************************************************************************************************/
212
213 int PhyloTree::addSeqToTree(string seqName, string seqTaxonomy){
214         try {
215                 numSeqs++;
216                 
217                 map<string, int>::iterator childPointer;
218                 
219                 int currentNode = 0;
220                 int level = 1;
221                 
222                 tree[0].accessions.push_back(seqName);
223                 string taxon;// = getNextTaxon(seqTaxonomy);
224                 
225                 while(seqTaxonomy != ""){
226                         
227                         level++;
228                         
229                         if (m->control_pressed) { return 0; }
230                         
231                         //somehow the parent is getting one too many accnos
232                         //use print to reassign the taxa id
233                         taxon = getNextTaxon(seqTaxonomy);
234                         
235                         childPointer = tree[currentNode].children.find(taxon);
236                         
237                         if(childPointer != tree[currentNode].children.end()){   //if the node already exists, move on
238                                 currentNode = childPointer->second;
239                                 tree[currentNode].accessions.push_back(seqName);
240                                 name2Taxonomy[seqName] = currentNode;
241                         }
242                         else{                                                                                   //otherwise, create it
243                                 tree.push_back(TaxNode(taxon));
244                                 numNodes++;
245                                 tree[currentNode].children[taxon] = numNodes-1;
246                                 tree[numNodes-1].parent = currentNode;
247                                 
248                                 //                      int numChildren = tree[currentNode].children.size();
249                                 //                      string heirarchyID = tree[currentNode].heirarchyID;
250                                 //                      tree[currentNode].accessions.push_back(seqName);
251                                 
252                                 currentNode = tree[currentNode].children[taxon];
253                                 tree[currentNode].accessions.push_back(seqName);
254                                 name2Taxonomy[seqName] = currentNode;
255                                 //                      tree[currentNode].level = level;
256                                 //                      tree[currentNode].childNumber = numChildren;
257                                 //                      tree[currentNode].heirarchyID = heirarchyID + '.' + toString(tree[currentNode].childNumber);
258                         }
259                 
260                         if (seqTaxonomy == "") {   uniqueTaxonomies[currentNode] = currentNode; }
261                 }
262
263         }
264         catch(exception& e) {
265                 m->errorOut(e, "PhyloTree", "addSeqToTree");
266                 exit(1);
267         }
268 }
269 /**************************************************************************************************/
270 vector<int> PhyloTree::getGenusNodes()  {
271         try {
272                 genusIndex.clear();
273                 //generate genusIndexes
274                 map<int, int>::iterator it2;
275                 for (it2=uniqueTaxonomies.begin(); it2!=uniqueTaxonomies.end(); it2++) {  genusIndex.push_back(it2->first);     }
276                 
277                 return genusIndex;
278         }
279         catch(exception& e) {
280                 m->errorOut(e, "PhyloTree", "getGenusNodes");
281                 exit(1);
282         }
283 }
284 /**************************************************************************************************/
285 vector<int> PhyloTree::getGenusTotals() {
286         try {
287         
288                 if (calcTotals) {
289                         totals.clear();
290                         //reset counts because we are on a new word
291                         for (int j = 0; j < genusIndex.size(); j++) {
292                                 totals.push_back(tree[genusIndex[j]].accessions.size());
293                         }
294                         return totals;
295                 }else{
296                         return totals;
297                 }
298                 
299         }
300         catch(exception& e) {
301                 m->errorOut(e, "PhyloTree", "getGenusNodes");
302                 exit(1);
303         }
304 }
305 /**************************************************************************************************/
306
307 void PhyloTree::assignHeirarchyIDs(int index){
308         try {
309                 map<string,int>::iterator it;
310                 int counter = 1;
311                 
312                 for(it=tree[index].children.begin();it!=tree[index].children.end();it++){
313                         tree[it->second].heirarchyID = tree[index].heirarchyID + '.' + toString(counter);
314                         counter++;
315                         tree[it->second].level = tree[index].level + 1;
316                                                 
317                         //save maxLevel for binning the unclassified seqs
318                         if (tree[it->second].level > maxLevel) { maxLevel = tree[it->second].level; } 
319                         
320                         assignHeirarchyIDs(it->second);
321                 }
322         }
323         catch(exception& e) {
324                 m->errorOut(e, "PhyloTree", "assignHeirarchyIDs");
325                 exit(1);
326         }
327 }
328 /**************************************************************************************************/
329 void PhyloTree::setUp(string tfile){
330         try{
331                 string taxFileNameTest = tfile.substr(0,tfile.find_last_of(".")+1) + "tree.sum";
332                 
333                 #ifdef USE_MPI
334                         int pid;
335                         MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
336
337                         if (pid == 0) {  binUnclassified(taxFileNameTest);  }
338                 
339                 #else
340                         //create file needed for summary if it doesn't exist
341                         ifstream FileTest(taxFileNameTest.c_str());
342                         
343                         if (!FileTest) { 
344                                 binUnclassified(taxFileNameTest); 
345                         }
346                 #endif
347         }
348         catch(exception& e) {
349                 m->errorOut(e, "PhyloTree", "setUp");
350                 exit(1);
351         }
352 }
353 /**************************************************************************************************/
354 void PhyloTree::binUnclassified(string file){
355         try {
356         
357                 ofstream out;
358                 openOutputFile(file, out);
359                 
360                 map<string, int>::iterator itBin;
361                 map<string, int>::iterator childPointer;
362                 
363                 vector<TaxNode> copy = tree;
364                                 
365                 //fill out tree
366                 fillOutTree(0, copy);
367                 
368                 //get leaf nodes that may need externsion
369                 for (int i = 0; i < copy.size(); i++) {  
370
371                         if (copy[i].children.size() == 0) {
372                                 leafNodes[i] = i;
373                         }
374                 }
375                 
376                 int copyNodes = copy.size();
377                 
378                 //go through the seqs and if a sequence finest taxon is not the same level as the most finely defined taxon then classify it as unclassified where necessary
379                 map<int, int>::iterator itLeaf;
380                 for (itLeaf = leafNodes.begin(); itLeaf != leafNodes.end(); itLeaf++) {
381                         
382                         if (m->control_pressed) {  out.close(); break;  }
383                         
384                         int level = copy[itLeaf->second].level;
385                         int currentNode = itLeaf->second;
386                         
387                         //this sequence is unclassified at some levels
388                         while(level <= maxLevel){
389                 
390                                 level++;
391                         
392                                 string taxon = "unclassified";  
393                                 
394                                 //does the parent have a child names 'unclassified'?
395                                 childPointer = copy[currentNode].children.find(taxon);
396                                 
397                                 if(childPointer != copy[currentNode].children.end()){   //if the node already exists, move on
398                                         currentNode = childPointer->second; //currentNode becomes 'unclassified'
399                                 }
400                                 else{                                                                                   //otherwise, create it
401                                         copy.push_back(TaxNode(taxon));
402                                         copyNodes++;
403                                         copy[currentNode].children[taxon] = copyNodes-1;
404                                         copy[copyNodes-1].parent = currentNode;
405                                         copy[copyNodes-1].level = copy[currentNode].level + 1;
406                                                                         
407                                         currentNode = copy[currentNode].children[taxon];
408                                 }
409                         }
410                 }
411                 
412                 if (!m->control_pressed) {
413                         //print copy tree
414                         print(out, copy);
415                 }
416                                 
417         }
418         catch(exception& e) {
419                 m->errorOut(e, "PhyloTree", "binUnclassified");
420                 exit(1);
421         }
422 }
423 /**************************************************************************************************/
424 void PhyloTree::fillOutTree(int index, vector<TaxNode>& copy) {
425         try {
426                 map<string,int>::iterator it;
427                 
428                 it = copy[index].children.find("unclassified");
429                 if (it == copy[index].children.end()) { //no unclassified at this level
430                         string taxon = "unclassified";
431                         copy.push_back(TaxNode(taxon));
432                         copy[index].children[taxon] = copy.size()-1;
433                         copy[copy.size()-1].parent = index;
434                         copy[copy.size()-1].level = copy[index].level + 1;
435                 }
436                 
437                 if (tree[index].level <= maxLevel) {
438                         for(it=tree[index].children.begin();it!=tree[index].children.end();it++){ //check your children
439                                 fillOutTree(it->second, copy);
440                         }
441                 }
442                 
443         }
444         catch(exception& e) {
445                 m->errorOut(e, "PhyloTree", "fillOutTree");
446                 exit(1);
447         }
448 }
449 /**************************************************************************************************/
450 string PhyloTree::getFullTaxonomy(string seqName) {
451         try {
452                 string tax = "";
453                 
454                 int currentNode = name2Taxonomy[seqName];
455                 
456                 while (tree[currentNode].parent != -1) {
457                         tax = tree[currentNode].name + ";" + tax;
458                         currentNode = tree[currentNode].parent;
459                 }
460                 
461                 return tax;
462         }
463         catch(exception& e) {
464                 m->errorOut(e, "PhyloTree", "getFullTaxonomy");
465                 exit(1);
466         }
467 }
468 /**************************************************************************************************/
469
470 void PhyloTree::print(ofstream& out, vector<TaxNode>& copy){
471         try {
472                 out << copy.size() << endl;
473                 
474                 for (int i = 0; i < copy.size(); i++) {
475         
476                         out << copy[i].level << '\t'<< copy[i].name << '\t' << copy[i].children.size() << '\t';
477                         
478                         map<string,int>::iterator it;
479                         for(it=copy[i].children.begin();it!=copy[i].children.end();it++){
480                                 out << it->first << '\t' << it->second << '\t';
481                         }
482                         out << endl;
483                 }
484                 
485                 out.close();
486         }
487         catch(exception& e) {
488                 m->errorOut(e, "PhyloTree", "print");
489                 exit(1);
490         }
491 }
492 /**************************************************************************************************/
493 void PhyloTree::printTreeNodes(string treefilename) {
494         try {
495         
496                 #ifdef USE_MPI
497                         int pid;
498                         MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
499
500                         if (pid == 0) {  
501                 
502                 #endif
503
504                         ofstream outTree;
505                         openOutputFile(treefilename, outTree);
506                         
507                         //print treenodes
508                         outTree << tree.size() << endl;
509                         for (int i = 0; i < tree.size(); i++) {
510                                 outTree << tree[i].name << '\t' << tree[i].level << '\t' << tree[i].parent << endl;
511                         }
512                         
513                         //print genus nodes
514                         outTree << endl << uniqueTaxonomies.size() << endl;
515                         map<int, int>::iterator it2;
516                         for (it2=uniqueTaxonomies.begin(); it2!=uniqueTaxonomies.end(); it2++) {  outTree << it2->first << '\t' << tree[it2->first].accessions.size() << endl;  }
517                         outTree << endl;
518                         
519                         
520                         outTree.close();
521                 
522                 #ifdef USE_MPI
523                         }
524                 #endif
525
526                 
527         }
528         catch(exception& e) {
529                 m->errorOut(e, "PhyloTree", "printTreeNodes");
530                 exit(1);
531         }
532 }
533 /**************************************************************************************************/
534
535
536