]> git.donarmstrong.com Git - mothur.git/blob - phylotree.cpp
added cluster.split command
[mothur.git] / phylotree.cpp
1 /*
2  *  doTaxonomy.cpp
3  *  
4  *
5  *  Created by Pat Schloss on 6/17/09.
6  *  Copyright 2009 Patrick D. Schloss. All rights reserved.
7  *
8  */
9
10 #include "phylotree.h"
11
12 /**************************************************************************************************/
13
14 PhyloTree::PhyloTree(){
15         try {
16                 m = MothurOut::getInstance();
17                 numNodes = 1;
18                 numSeqs = 0;
19                 tree.push_back(TaxNode("Root"));
20                 tree[0].heirarchyID = "0";
21                 maxLevel = 0;
22                 calcTotals = true;
23         }
24         catch(exception& e) {
25                 m->errorOut(e, "PhyloTree", "PhyloTree");
26                 exit(1);
27         }
28 }
29 /**************************************************************************************************/
30
31 PhyloTree::PhyloTree(ifstream& in, string filename){
32         try {
33                 m = MothurOut::getInstance();
34                 calcTotals = false;
35                 
36                 #ifdef USE_MPI
37                         MPI_File inMPI;
38                         MPI_Offset size;
39                         MPI_Status status;
40
41                         char inFileName[1024];
42                         strcpy(inFileName, filename.c_str());
43
44                         MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI);  
45                         MPI_File_get_size(inMPI, &size);
46                         
47                         char* buffer = new char[size];
48                         MPI_File_read(inMPI, buffer, size, MPI_CHAR, &status);
49
50                         string tempBuf = buffer;
51                         if (tempBuf.length() > size) { tempBuf = tempBuf.substr(0, size);  }
52                         istringstream iss (tempBuf,istringstream::in);
53                         delete buffer;
54                         
55                         iss >> numNodes; gobble(iss);
56                         
57                         tree.resize(numNodes);
58                         
59                         for (int i = 0; i < tree.size(); i++) {
60                                 iss >> tree[i].name >> tree[i].level >> tree[i].parent; gobble(iss);
61                         }
62                         
63                         //read genus nodes
64                         int numGenus = 0;
65                         iss >> numGenus; gobble(iss);
66                         
67                         int gnode, gsize;
68                         totals.clear();
69                         for (int i = 0; i < numGenus; i++) {
70                                 iss >> gnode >> gsize; gobble(iss);
71                                 
72                                 uniqueTaxonomies[gnode] = gnode;
73                                 totals.push_back(gsize);
74                         }
75                         
76                         MPI_File_close(&inMPI);
77                         
78                 #else
79                         in >> numNodes; gobble(in);
80                         
81                         tree.resize(numNodes);
82                         
83                         for (int i = 0; i < tree.size(); i++) {
84                                 in >> tree[i].name >> tree[i].level >> tree[i].parent; gobble(in);
85                         }
86                         
87                         //read genus nodes
88                         int numGenus = 0;
89                         in >> numGenus; gobble(in);
90                         
91                         int gnode, gsize;
92                         totals.clear();
93                         for (int i = 0; i < numGenus; i++) {
94                                 in >> gnode >> gsize; gobble(in);
95                                 
96                                 uniqueTaxonomies[gnode] = gnode;
97                                 totals.push_back(gsize);
98                         }
99                         
100                         in.close();
101                         
102                 #endif
103                 
104         }
105         catch(exception& e) {
106                 m->errorOut(e, "PhyloTree", "PhyloTree");
107                 exit(1);
108         }
109 }
110 /**************************************************************************************************/
111
112 PhyloTree::PhyloTree(string tfile){
113         try {
114                 m = MothurOut::getInstance();
115                 numNodes = 1;
116                 numSeqs = 0;
117                 tree.push_back(TaxNode("Root"));
118                 tree[0].heirarchyID = "0";
119                 maxLevel = 0;
120                 calcTotals = true;
121                 string name, tax;
122
123                 
124                 #ifdef USE_MPI
125                         int pid, num, processors;
126                         vector<long> positions;
127                         
128                         MPI_Status status; 
129                         MPI_File inMPI;
130                         MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
131                         MPI_Comm_size(MPI_COMM_WORLD, &processors);
132
133                         char inFileName[1024];
134                         strcpy(inFileName, tfile.c_str());
135
136                         MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI);  //comm, filename, mode, info, filepointer
137
138                         if (pid == 0) {
139                                 positions = setFilePosEachLine(tfile, num);
140                                 
141                                 //send file positions to all processes
142                                 for(int i = 1; i < processors; i++) { 
143                                         MPI_Send(&num, 1, MPI_INT, i, 2001, MPI_COMM_WORLD);
144                                         MPI_Send(&positions[0], (num+1), MPI_LONG, i, 2001, MPI_COMM_WORLD);
145                                 }
146                         }else{
147                                 MPI_Recv(&num, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status);
148                                 positions.resize(num+1);
149                                 MPI_Recv(&positions[0], (num+1), MPI_LONG, 0, 2001, MPI_COMM_WORLD, &status);
150                         }
151                 
152                         //read file 
153                         for(int i=0;i<num;i++){
154                                 //read next sequence
155                                 int length = positions[i+1] - positions[i];
156                                 char* buf4 = new char[length];
157
158                                 MPI_File_read_at(inMPI, positions[i], buf4, length, MPI_CHAR, &status);
159
160                                 string tempBuf = buf4;
161                                 if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length); }
162                                 delete buf4;
163
164                                 istringstream iss (tempBuf,istringstream::in);
165                                 iss >> name >> tax;
166                                 addSeqToTree(name, tax);
167                         }
168                         
169                         MPI_File_close(&inMPI);
170                         MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case
171                 
172                 #else
173                         ifstream in;
174                         openInputFile(tfile, in);
175                         
176                         //read in users taxonomy file and add sequences to tree
177                         while(!in.eof()){
178                                 in >> name >> tax; gobble(in);
179                                 
180                                 addSeqToTree(name, tax);
181                         }
182                         in.close();
183                 #endif
184                 
185                 assignHeirarchyIDs(0);
186                 
187                 //create file for summary if needed
188                 setUp(tfile);
189         }
190         catch(exception& e) {
191                 m->errorOut(e, "PhyloTree", "PhyloTree");
192                 exit(1);
193         }
194 }
195
196 /**************************************************************************************************/
197
198 string PhyloTree::getNextTaxon(string& heirarchy){
199         try {
200                 string currentLevel = "";
201                 if(heirarchy != ""){
202                         int pos = heirarchy.find_first_of(';');
203                         currentLevel=heirarchy.substr(0,pos);
204                         if (pos != (heirarchy.length()-1)) {  heirarchy=heirarchy.substr(pos+1);  }
205                         else { heirarchy = ""; }
206                 }
207                 return currentLevel;
208         }
209         catch(exception& e) {
210                 m->errorOut(e, "PhyloTree", "getNextTaxon");
211                 exit(1);
212         }
213 }
214
215 /**************************************************************************************************/
216
217 int PhyloTree::addSeqToTree(string seqName, string seqTaxonomy){
218         try {
219                 numSeqs++;
220                 
221                 map<string, int>::iterator childPointer;
222                 
223                 int currentNode = 0;
224                 int level = 1;
225                 
226                 tree[0].accessions.push_back(seqName);
227                 string taxon;// = getNextTaxon(seqTaxonomy);
228                 
229                 while(seqTaxonomy != ""){
230                         
231                         level++;
232                         
233                         if (m->control_pressed) { return 0; }
234                         
235                         //somehow the parent is getting one too many accnos
236                         //use print to reassign the taxa id
237                         taxon = getNextTaxon(seqTaxonomy);
238                         
239                         childPointer = tree[currentNode].children.find(taxon);
240                         
241                         if(childPointer != tree[currentNode].children.end()){   //if the node already exists, move on
242                                 currentNode = childPointer->second;
243                                 tree[currentNode].accessions.push_back(seqName);
244                                 name2Taxonomy[seqName] = currentNode;
245                         }
246                         else{                                                                                   //otherwise, create it
247                                 tree.push_back(TaxNode(taxon));
248                                 numNodes++;
249                                 tree[currentNode].children[taxon] = numNodes-1;
250                                 tree[numNodes-1].parent = currentNode;
251                                 
252                                 //                      int numChildren = tree[currentNode].children.size();
253                                 //                      string heirarchyID = tree[currentNode].heirarchyID;
254                                 //                      tree[currentNode].accessions.push_back(seqName);
255                                 
256                                 currentNode = tree[currentNode].children[taxon];
257                                 tree[currentNode].accessions.push_back(seqName);
258                                 name2Taxonomy[seqName] = currentNode;
259                                 //                      tree[currentNode].level = level;
260                                 //                      tree[currentNode].childNumber = numChildren;
261                                 //                      tree[currentNode].heirarchyID = heirarchyID + '.' + toString(tree[currentNode].childNumber);
262                         }
263                 
264                         if (seqTaxonomy == "") {   uniqueTaxonomies[currentNode] = currentNode; }
265                 }
266
267         }
268         catch(exception& e) {
269                 m->errorOut(e, "PhyloTree", "addSeqToTree");
270                 exit(1);
271         }
272 }
273 /**************************************************************************************************/
274 vector<int> PhyloTree::getGenusNodes()  {
275         try {
276                 genusIndex.clear();
277                 //generate genusIndexes
278                 map<int, int>::iterator it2;
279                 for (it2=uniqueTaxonomies.begin(); it2!=uniqueTaxonomies.end(); it2++) {  genusIndex.push_back(it2->first);     }
280                 
281                 return genusIndex;
282         }
283         catch(exception& e) {
284                 m->errorOut(e, "PhyloTree", "getGenusNodes");
285                 exit(1);
286         }
287 }
288 /**************************************************************************************************/
289 vector<int> PhyloTree::getGenusTotals() {
290         try {
291         
292                 if (calcTotals) {
293                         totals.clear();
294                         //reset counts because we are on a new word
295                         for (int j = 0; j < genusIndex.size(); j++) {
296                                 totals.push_back(tree[genusIndex[j]].accessions.size());
297                         }
298                         return totals;
299                 }else{
300                         return totals;
301                 }
302                 
303         }
304         catch(exception& e) {
305                 m->errorOut(e, "PhyloTree", "getGenusNodes");
306                 exit(1);
307         }
308 }
309 /**************************************************************************************************/
310
311 void PhyloTree::assignHeirarchyIDs(int index){
312         try {
313                 map<string,int>::iterator it;
314                 int counter = 1;
315                 
316                 for(it=tree[index].children.begin();it!=tree[index].children.end();it++){
317                         tree[it->second].heirarchyID = tree[index].heirarchyID + '.' + toString(counter);
318                         counter++;
319                         tree[it->second].level = tree[index].level + 1;
320                                                 
321                         //save maxLevel for binning the unclassified seqs
322                         if (tree[it->second].level > maxLevel) { maxLevel = tree[it->second].level; } 
323                         
324                         assignHeirarchyIDs(it->second);
325                 }
326         }
327         catch(exception& e) {
328                 m->errorOut(e, "PhyloTree", "assignHeirarchyIDs");
329                 exit(1);
330         }
331 }
332 /**************************************************************************************************/
333 void PhyloTree::setUp(string tfile){
334         try{
335                 string taxFileNameTest = tfile.substr(0,tfile.find_last_of(".")+1) + "tree.sum";
336                 
337                 #ifdef USE_MPI
338                         int pid;
339                         MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
340
341                         if (pid == 0) {  binUnclassified(taxFileNameTest);  }
342                 
343                 #else
344                         //create file needed for summary if it doesn't exist
345                         ifstream FileTest(taxFileNameTest.c_str());
346                         
347                         if (!FileTest) { 
348                                 binUnclassified(taxFileNameTest); 
349                         }
350                 #endif
351         }
352         catch(exception& e) {
353                 m->errorOut(e, "PhyloTree", "setUp");
354                 exit(1);
355         }
356 }
357 /**************************************************************************************************/
358 void PhyloTree::binUnclassified(string file){
359         try {
360         
361                 ofstream out;
362                 openOutputFile(file, out);
363                 
364                 map<string, int>::iterator itBin;
365                 map<string, int>::iterator childPointer;
366                 
367                 vector<TaxNode> copy = tree;
368                                 
369                 //fill out tree
370                 fillOutTree(0, copy);
371                 
372                 //get leaf nodes that may need externsion
373                 for (int i = 0; i < copy.size(); i++) {  
374
375                         if (copy[i].children.size() == 0) {
376                                 leafNodes[i] = i;
377                         }
378                 }
379                 
380                 int copyNodes = copy.size();
381                 
382                 //go through the seqs and if a sequence finest taxon is not the same level as the most finely defined taxon then classify it as unclassified where necessary
383                 map<int, int>::iterator itLeaf;
384                 for (itLeaf = leafNodes.begin(); itLeaf != leafNodes.end(); itLeaf++) {
385                         
386                         if (m->control_pressed) {  out.close(); break;  }
387                         
388                         int level = copy[itLeaf->second].level;
389                         int currentNode = itLeaf->second;
390                         
391                         //this sequence is unclassified at some levels
392                         while(level <= maxLevel){
393                 
394                                 level++;
395                         
396                                 string taxon = "unclassified";  
397                                 
398                                 //does the parent have a child names 'unclassified'?
399                                 childPointer = copy[currentNode].children.find(taxon);
400                                 
401                                 if(childPointer != copy[currentNode].children.end()){   //if the node already exists, move on
402                                         currentNode = childPointer->second; //currentNode becomes 'unclassified'
403                                 }
404                                 else{                                                                                   //otherwise, create it
405                                         copy.push_back(TaxNode(taxon));
406                                         copyNodes++;
407                                         copy[currentNode].children[taxon] = copyNodes-1;
408                                         copy[copyNodes-1].parent = currentNode;
409                                         copy[copyNodes-1].level = copy[currentNode].level + 1;
410                                                                         
411                                         currentNode = copy[currentNode].children[taxon];
412                                 }
413                         }
414                 }
415                 
416                 if (!m->control_pressed) {
417                         //print copy tree
418                         print(out, copy);
419                 }
420                                 
421         }
422         catch(exception& e) {
423                 m->errorOut(e, "PhyloTree", "binUnclassified");
424                 exit(1);
425         }
426 }
427 /**************************************************************************************************/
428 void PhyloTree::fillOutTree(int index, vector<TaxNode>& copy) {
429         try {
430                 map<string,int>::iterator it;
431                 
432                 it = copy[index].children.find("unclassified");
433                 if (it == copy[index].children.end()) { //no unclassified at this level
434                         string taxon = "unclassified";
435                         copy.push_back(TaxNode(taxon));
436                         copy[index].children[taxon] = copy.size()-1;
437                         copy[copy.size()-1].parent = index;
438                         copy[copy.size()-1].level = copy[index].level + 1;
439                 }
440                 
441                 if (tree[index].level <= maxLevel) {
442                         for(it=tree[index].children.begin();it!=tree[index].children.end();it++){ //check your children
443                                 fillOutTree(it->second, copy);
444                         }
445                 }
446                 
447         }
448         catch(exception& e) {
449                 m->errorOut(e, "PhyloTree", "fillOutTree");
450                 exit(1);
451         }
452 }
453 /**************************************************************************************************/
454 string PhyloTree::getFullTaxonomy(string seqName) {
455         try {
456                 string tax = "";
457                 
458                 int currentNode = name2Taxonomy[seqName];
459                 
460                 while (tree[currentNode].parent != -1) {
461                         tax = tree[currentNode].name + ";" + tax;
462                         currentNode = tree[currentNode].parent;
463                 }
464                 
465                 return tax;
466         }
467         catch(exception& e) {
468                 m->errorOut(e, "PhyloTree", "getFullTaxonomy");
469                 exit(1);
470         }
471 }
472 /**************************************************************************************************/
473
474 void PhyloTree::print(ofstream& out, vector<TaxNode>& copy){
475         try {
476                 out << copy.size() << endl;
477                 
478                 for (int i = 0; i < copy.size(); i++) {
479         
480                         out << copy[i].level << '\t'<< copy[i].name << '\t' << copy[i].children.size() << '\t';
481                         
482                         map<string,int>::iterator it;
483                         for(it=copy[i].children.begin();it!=copy[i].children.end();it++){
484                                 out << it->first << '\t' << it->second << '\t';
485                         }
486                         out << endl;
487                 }
488                 
489                 out.close();
490         }
491         catch(exception& e) {
492                 m->errorOut(e, "PhyloTree", "print");
493                 exit(1);
494         }
495 }
496 /**************************************************************************************************/
497 void PhyloTree::printTreeNodes(string treefilename) {
498         try {
499         
500                 #ifdef USE_MPI
501                         int pid;
502                         MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
503
504                         if (pid == 0) {  
505                 
506                 #endif
507
508                         ofstream outTree;
509                         openOutputFile(treefilename, outTree);
510                         
511                         //print treenodes
512                         outTree << tree.size() << endl;
513                         for (int i = 0; i < tree.size(); i++) {
514                                 outTree << tree[i].name << '\t' << tree[i].level << '\t' << tree[i].parent << endl;
515                         }
516                         
517                         //print genus nodes
518                         outTree << endl << uniqueTaxonomies.size() << endl;
519                         map<int, int>::iterator it2;
520                         for (it2=uniqueTaxonomies.begin(); it2!=uniqueTaxonomies.end(); it2++) {  outTree << it2->first << '\t' << tree[it2->first].accessions.size() << endl;  }
521                         outTree << endl;
522                         
523                         
524                         outTree.close();
525                 
526                 #ifdef USE_MPI
527                         }
528                 #endif
529
530                 
531         }
532         catch(exception& e) {
533                 m->errorOut(e, "PhyloTree", "printTreeNodes");
534                 exit(1);
535         }
536 }
537 /**************************************************************************************************/
538
539
540