5 * Created by Pat Schloss on 6/17/09.
6 * Copyright 2009 Patrick D. Schloss. All rights reserved.
10 #include "phylotree.h"
12 /**************************************************************************************************/
// Default constructor: seeds the tree with a "Root" node at index 0 whose heirarchyID is "0",
// then registers the placeholder sequence "unknown" with the taxonomy "unknown;".
14 PhyloTree::PhyloTree(){
// Obtain the MothurOut instance that the rest of the class uses for logging and error reporting.
16 m = MothurOut::getInstance();
// The root always lives at tree[0]; every hierarchy ID built later is prefixed by its "0".
19 tree.push_back(TaxNode("Root"));
20 tree[0].heirarchyID = "0";
// Goes through the normal insertion path, so "unknown" is recorded like any other sequence.
23 addSeqToTree("unknown", "unknown;");
// Reports the exception `e` with class and function labels
// (presumably inside a catch handler that is not visible in this listing).
26 m->errorOut(e, "PhyloTree", "PhyloTree");
30 /**************************************************************************************************/
// Constructs the tree from a previously written tree file rather than from a raw taxonomy file.
// Two parsing paths are visible: one reads `filename` through MPI file I/O into an istringstream,
// the other reads from the caller's stream `in`. Both consume the same layout: a header line,
// the node count, one "name level parent" record per node, the genus count, and one
// "nodeIndex size" record per genus (the layout printTreeNodes() writes).
// NOTE(review): whatever selects between the two paths is not visible in this listing --
// presumably a build-time MPI switch; confirm.
32 PhyloTree::PhyloTree(ifstream& in, string filename){
34 m = MothurOut::getInstance();
// --- MPI path ---
// NOTE(review): strcpy performs no bounds check; a filename of 1024 or more characters
// overflows inFileName.
44 char inFileName[1024];
45 strcpy(inFileName, filename.c_str());
47 MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI);
48 MPI_File_get_size(inMPI, &size);
// Read the whole file into memory in one call.
// NOTE(review): the buffer holds exactly `size` chars and no terminator is written in the
// visible lines, yet it is converted below as a C string; string(buffer, size) would avoid
// reading past the allocation. No release of `buffer` is visible here either -- confirm.
50 char* buffer = new char[size];
51 MPI_File_read(inMPI, buffer, size, MPI_CHAR, &status);
53 string tempBuf = buffer;
// Trim whatever was picked up beyond the file contents by the unterminated conversion above.
54 if (tempBuf.length() > size) { tempBuf = tempBuf.substr(0, size); }
55 istringstream iss (tempBuf,istringstream::in);
// Skip the header line (printTreeNodes() writes "#" followed by the mothur version there).
59 m->getline(iss); m->gobble(iss);
61 iss >> numNodes; m->gobble(iss);
// Allocate all nodes up front, then fill each one from its record.
63 tree.resize(numNodes);
65 for (int i = 0; i < tree.size(); i++) {
66 iss >> tree[i].name >> tree[i].level >> tree[i].parent; m->gobble(iss);
71 iss >> numGenus; m->gobble(iss);
// Each genus record is a node index plus its stored size.
75 for (int i = 0; i < numGenus; i++) {
76 iss >> gnode >> gsize; m->gobble(iss);
// uniqueTaxonomies maps each genus node index to itself; totals keeps the sizes in file order.
78 uniqueTaxonomies[gnode] = gnode;
79 totals.push_back(gsize);
82 MPI_File_close(&inMPI);
// --- stream path: same parsing steps, reading directly from `in` ---
86 string line = m->getline(in); m->gobble(in);
88 in >> numNodes; m->gobble(in);
90 tree.resize(numNodes);
92 for (int i = 0; i < tree.size(); i++) {
93 in >> tree[i].name >> tree[i].level >> tree[i].parent; m->gobble(in);
98 in >> numGenus; m->gobble(in);
102 for (int i = 0; i < numGenus; i++) {
103 in >> gnode >> gsize; m->gobble(in);
105 uniqueTaxonomies[gnode] = gnode;
106 totals.push_back(gsize);
// Any exception raised while loading is reported with this constructor's labels.
114 catch(exception& e) {
115 m->errorOut(e, "PhyloTree", "PhyloTree");
119 /**************************************************************************************************/
// Constructs the tree from a taxonomy file `tfile`: seeds the root and the "unknown" placeholder,
// adds every (name, taxonomy) record through addSeqToTree(), then assigns hierarchy IDs and
// levels starting from the root.
// Two reading paths are visible (MPI file I/O and a plain input stream).
// NOTE(review): whatever selects between them is not visible in this listing -- presumably a
// build-time MPI switch; confirm.
121 PhyloTree::PhyloTree(string tfile){
123 m = MothurOut::getInstance();
// The root always lives at tree[0].
126 tree.push_back(TaxNode("Root"));
127 tree[0].heirarchyID = "0";
131 addSeqToTree("unknown", "unknown;");
// --- MPI path ---
135 int pid, num, processors;
136 vector<unsigned long long> positions;
140 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
141 MPI_Comm_size(MPI_COMM_WORLD, &processors);
// NOTE(review): strcpy performs no bounds check; a path of 1024 or more characters
// overflows inFileName.
143 char inFileName[1024];
144 strcpy(inFileName, tfile.c_str());
146 MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI); //comm, filename, mode, info, filepointer
// Presumably fills `positions` with the byte offset of every line and sets `num` to the line
// count (num+1 offsets are transmitted below) -- confirm against setFilePosEachLine.
149 positions = m->setFilePosEachLine(tfile, num);
151 //send file positions to all processes
// NOTE(review): `positions` holds unsigned long long but is sent and received as MPI_LONG;
// where long is narrower than unsigned long long this transfers the wrong number of bytes.
// MPI_UNSIGNED_LONG_LONG matches the element type.
152 for(int i = 1; i < processors; i++) {
153 MPI_Send(&num, 1, MPI_INT, i, 2001, MPI_COMM_WORLD);
154 MPI_Send(&positions[0], (num+1), MPI_LONG, i, 2001, MPI_COMM_WORLD);
// Receiving side: learn the count, size the vector, then take the offsets from process 0.
// (Presumably only processes other than 0 run these lines; the guard is not visible here.)
157 MPI_Recv(&num, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status);
158 positions.resize(num+1);
159 MPI_Recv(&positions[0], (num+1), MPI_LONG, 0, 2001, MPI_COMM_WORLD, &status);
// Every process reads every line and builds its own copy of the tree.
163 for(int i=0;i<num;i++){
// Line i spans the bytes from positions[i] up to positions[i+1].
// NOTE(review): the unsigned long long difference is narrowed to int.
165 int length = positions[i+1] - positions[i];
// NOTE(review): buf4 holds exactly `length` chars with no terminator written in the visible
// lines, yet it is converted below as a C string; string(buf4, length) would avoid reading
// past the allocation. No release of buf4 is visible here either -- confirm.
166 char* buf4 = new char[length];
168 MPI_File_read_at(inMPI, positions[i], buf4, length, MPI_CHAR, &status);
170 string tempBuf = buf4;
171 if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length); }
174 istringstream iss (tempBuf,istringstream::in);
// `name` and `tax` are presumably extracted from `iss` on a line not shown here.
176 addSeqToTree(name, tax);
179 MPI_File_close(&inMPI);
180 MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case
// --- stream path ---
184 m->openInputFile(tfile, in);
186 //read in users taxonomy file and add sequences to tree
188 in >> name >> tax; m->gobble(in);
190 addSeqToTree(name, tax);
// With all sequences inserted, walk the tree from the root to assign heirarchyID and level.
195 assignHeirarchyIDs(0);
197 //create file for summary if needed
200 catch(exception& e) {
201 m->errorOut(e, "PhyloTree", "PhyloTree");
206 /**************************************************************************************************/
// Splits the leading taxon off a ';'-separated taxonomy string.
//   heirarchy - in/out: the remaining taxonomy string; when a ';' is found, the consumed taxon
//               and its ';' are removed from it.
//   seqname   - used only in the warning message when no ';' is found.
// The extracted taxon is collected in currentLevel (presumably the return value; the return
// statement is not visible in this listing).
208 string PhyloTree::getNextTaxon(string& heirarchy, string seqname){
210 string currentLevel = "";
// NOTE(review): find_first_of returns an unsigned size_type; storing it in int and testing for
// -1 relies on string::npos narrowing to -1. Comparing against string::npos is the portable
// form. The comparison with heirarchy.length()-1 below also mixes signed and unsigned values.
212 int pos = heirarchy.find_first_of(';');
214 if (pos == -1) { //you can't find another ;
// No separator left: treat everything that remains as the final taxon and warn the user.
215 currentLevel = heirarchy;
217 m->mothurOut(seqname + " is missing a ;, please check for other errors."); m->mothurOutEndLine();
// Separator found: the taxon is the text before it.
219 currentLevel=heirarchy.substr(0,pos);
// Keep whatever follows the ';' for the next call; if the ';' was the last character,
// nothing remains.
220 if (pos != (heirarchy.length()-1)) { heirarchy=heirarchy.substr(pos+1); }
221 else { heirarchy = ""; }
227 catch(exception& e) {
228 m->errorOut(e, "PhyloTree", "getNextTaxon");
233 /**************************************************************************************************/
// Inserts one sequence into the tree by walking its ';'-separated taxonomy from the root,
// creating any missing nodes along the way.
//   seqName     - sequence identifier; appended to the accessions of every node on its path.
//   seqTaxonomy - taxonomy string; consumed taxon by taxon through getNextTaxon().
// Returns 0 early when m->control_pressed is set; the normal return is not visible here.
// `currentNode` and `numNodes` are used below but declared or updated on lines not shown.
235 int PhyloTree::addSeqToTree(string seqName, string seqTaxonomy){
239 map<string, int>::iterator childPointer;
// Every sequence is recorded on the root node.
244 tree[0].accessions.push_back(seqName);
// Presumably strips confidence annotations from the taxonomy string -- confirm against
// MothurOut::removeConfidences.
245 m->removeConfidences(seqTaxonomy);
247 string taxon;// = getNextTaxon(seqTaxonomy);
// Consume one taxon per iteration until the taxonomy string is empty.
249 while(seqTaxonomy != ""){
253 if (m->control_pressed) { return 0; }
255 //somehow the parent is getting one too many accnos
256 //use print to reassign the taxa id
257 taxon = getNextTaxon(seqTaxonomy, seqName);
// An empty taxon means the taxonomy is malformed: warn, mark the node reached so far as a
// unique taxonomy (unless still at the root), and stop descending.
259 if (taxon == "") { m->mothurOut(seqName + " has an error in the taxonomy. This may be due to a ;;"); m->mothurOutEndLine(); if (currentNode != 0) { uniqueTaxonomies[currentNode] = currentNode; } break; }
261 childPointer = tree[currentNode].children.find(taxon);
263 if(childPointer != tree[currentNode].children.end()){ //if the node already exists, move on
// Descend into the existing child and record the sequence there; name2Taxonomy is overwritten
// at each step, so it ends up pointing at the deepest node reached.
264 currentNode = childPointer->second;
265 tree[currentNode].accessions.push_back(seqName);
266 name2Taxonomy[seqName] = currentNode;
268 else{ //otherwise, create it
269 tree.push_back(TaxNode(taxon));
// numNodes-1 is used as the index of the node just pushed (numNodes is presumably incremented
// on a line not shown here). Link parent -> child and child -> parent.
271 tree[currentNode].children[taxon] = numNodes-1;
272 tree[numNodes-1].parent = currentNode;
274 currentNode = tree[currentNode].children[taxon];
275 tree[currentNode].accessions.push_back(seqName);
276 name2Taxonomy[seqName] = currentNode;
// Taxonomy exhausted: the node reached is the finest taxon of this sequence.
279 if (seqTaxonomy == "") { uniqueTaxonomies[currentNode] = currentNode; }
284 catch(exception& e) {
285 m->errorOut(e, "PhyloTree", "addSeqToTree");
289 /**************************************************************************************************/
// Collects the node indexes stored as keys of uniqueTaxonomies into genusIndex, in the map's
// key order.
// NOTE(review): whether genusIndex is cleared first, and what is returned (presumably
// genusIndex), is on lines not visible in this listing.
290 vector<int> PhyloTree::getGenusNodes() {
293 //generate genusIndexes
294 map<int, int>::iterator it2;
295 for (it2=uniqueTaxonomies.begin(); it2!=uniqueTaxonomies.end(); it2++) { genusIndex.push_back(it2->first); }
299 catch(exception& e) {
300 m->errorOut(e, "PhyloTree", "getGenusNodes");
304 /**************************************************************************************************/
// Appends to `totals` the number of accessions stored on each node listed in genusIndex, in
// genusIndex order.
// NOTE(review): any guard or reset of `totals` before the loop, and the return statement
// (presumably returning totals), are on lines not visible in this listing.
305 vector<int> PhyloTree::getGenusTotals() {
310 //reset counts because we are on a new word
311 for (int j = 0; j < genusIndex.size(); j++) {
312 totals.push_back(tree[genusIndex[j]].accessions.size());
// NOTE(review): the label passed below is "getGenusNodes" although this is getGenusTotals --
// looks like a copy/paste slip that would misattribute errors.
320 catch(exception& e) {
321 m->errorOut(e, "PhyloTree", "getGenusNodes");
325 /**************************************************************************************************/
// Recursively labels the subtree under node `index`: each child gets a heirarchyID made of the
// parent's ID, a '.', and `counter`, plus a level one deeper than its parent. maxLevel is
// raised to the deepest level seen.
// `counter` is declared and (presumably) advanced per child on lines not visible here.
327 void PhyloTree::assignHeirarchyIDs(int index){
329 map<string,int>::iterator it;
// Children are visited in the key order of the children map.
332 for(it=tree[index].children.begin();it!=tree[index].children.end();it++){
333 tree[it->second].heirarchyID = tree[index].heirarchyID + '.' + toString(counter);
335 tree[it->second].level = tree[index].level + 1;
337 //save maxLevel for binning the unclassified seqs
338 if (tree[it->second].level > maxLevel) { maxLevel = tree[it->second].level; }
// Label the grandchildren before moving on to the next sibling.
340 assignHeirarchyIDs(it->second);
343 catch(exception& e) {
344 m->errorOut(e, "PhyloTree", "assignHeirarchyIDs");
348 /**************************************************************************************************/
// Derives the summary file name from `tfile` and runs binUnclassified() on it.
// Two call sites are visible: one restricted to MPI rank 0 and one unconditional.
// NOTE(review): whatever selects between them is not visible in this listing -- presumably a
// build-time MPI switch; confirm.
349 void PhyloTree::setUp(string tfile){
// Keep everything up to and including the last '.', then append "tree.sum".
// If tfile has no '.', find_last_of returns npos, npos+1 wraps to 0, and the result is just
// "tree.sum".
351 string taxFileNameTest = tfile.substr(0,tfile.find_last_of(".")+1) + "tree.sum";
355 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
// Only rank 0 calls binUnclassified in this path.
357 if (pid == 0) { binUnclassified(taxFileNameTest); }
360 binUnclassified(taxFileNameTest);
363 catch(exception& e) {
364 m->errorOut(e, "PhyloTree", "setUp");
368 /**************************************************************************************************/
// Works on a copy of the tree and extends every leaf with "unclassified" nodes until it
// reaches maxLevel. The member `tree` itself is not modified in the visible lines.
//   file - path of the output file opened at the start.
// NOTE(review): several statements (population of leafNodes, increments of copyNodes and
// level, and the body of the final `if`) are on lines not visible in this listing.
369 void PhyloTree::binUnclassified(string file){
373 m->openOutputFile(file, out);
375 map<string, int>::iterator itBin;
376 map<string, int>::iterator childPointer;
// All edits below go to this copy.
378 vector<TaxNode> copy = tree;
// First pass: fillOutTree() adds "unclassified" children throughout the copy, starting at the root.
381 fillOutTree(0, copy);
383 //get leaf nodes that may need extension
384 for (int i = 0; i < copy.size(); i++) {
// A node with no children is a leaf (presumably recorded in leafNodes on a line not shown).
386 if (copy[i].children.size() == 0) {
// Running node count; copyNodes-1 is used below as the index of a freshly pushed node.
391 int copyNodes = copy.size();
393 //go through the seqs and if a sequence finest taxon is not the same level as the most finely defined taxon then classify it as unclassified where necessary
394 map<int, int>::iterator itLeaf;
395 for (itLeaf = leafNodes.begin(); itLeaf != leafNodes.end(); itLeaf++) {
// On user interrupt, close the output file and abandon the remaining leaves.
397 if (m->control_pressed) { out.close(); break; }
399 int level = copy[itLeaf->second].level;
400 int currentNode = itLeaf->second;
402 //this sequence is unclassified at some levels
// Extend this leaf one "unclassified" node at a time until it is maxLevel deep.
403 while(level < maxLevel){
407 string taxon = "unclassified";
409 //does the parent have a child named 'unclassified'?
410 childPointer = copy[currentNode].children.find(taxon);
412 if(childPointer != copy[currentNode].children.end()){ //if the node already exists, move on
413 currentNode = childPointer->second; //currentNode becomes 'unclassified'
415 else{ //otherwise, create it
416 copy.push_back(TaxNode(taxon));
// Link the new node under currentNode, one level deeper, then step into it.
418 copy[currentNode].children[taxon] = copyNodes-1;
419 copy[copyNodes-1].parent = currentNode;
420 copy[copyNodes-1].level = copy[currentNode].level + 1;
422 currentNode = copy[currentNode].children[taxon];
// Skipped after an interrupt (the guarded statements are not visible in this listing).
427 if (!m->control_pressed) {
433 catch(exception& e) {
434 m->errorOut(e, "PhyloTree", "binUnclassified");
438 /**************************************************************************************************/
// Ensures node `index` of `copy` has a child named "unclassified", then recurses into that
// node's children.
//   index - node to process; used to index both `copy` and the member `tree`.
//   copy  - working copy of the tree that receives the new nodes.
// The recursion walks the children recorded in the member `tree`, not in `copy`, so the
// "unclassified" nodes added here are not themselves visited.
439 void PhyloTree::fillOutTree(int index, vector<TaxNode>& copy) {
442 map<string,int>::iterator it;
444 it = copy[index].children.find("unclassified");
445 if (it == copy[index].children.end()) { //no unclassified at this level
// Append a new "unclassified" node and wire it under `index`, one level deeper.
446 string taxon = "unclassified";
447 copy.push_back(TaxNode(taxon));
448 copy[index].children[taxon] = copy.size()-1;
449 copy[copy.size()-1].parent = index;
450 copy[copy.size()-1].level = copy[index].level + 1;
// Descend only while this node is above the deepest level of the tree.
453 if (tree[index].level < maxLevel) {
454 for(it=tree[index].children.begin();it!=tree[index].children.end();it++){ //check your children
455 fillOutTree(it->second, copy);
460 catch(exception& e) {
461 m->errorOut(e, "PhyloTree", "fillOutTree");
465 /**************************************************************************************************/
// Builds the taxonomy string for `seqName` by walking parent links upward from the node stored
// in name2Taxonomy and prepending each node name followed by ';'.
// `tax` is declared and (presumably) returned on lines not visible in this listing.
466 string PhyloTree::getFullTaxonomy(string seqName) {
// NOTE(review): operator[] inserts an entry with value 0 when seqName is unknown, so an
// unknown name silently maps to node 0 and grows the map; find() would avoid both.
470 int currentNode = name2Taxonomy[seqName];
// Stop at the node whose parent is -1 (presumably the root); that node's own name is not
// added to the string.
472 while (tree[currentNode].parent != -1) {
473 tax = tree[currentNode].name + ";" + tax;
474 currentNode = tree[currentNode].parent;
479 catch(exception& e) {
480 m->errorOut(e, "PhyloTree", "getFullTaxonomy");
484 /**************************************************************************************************/
// Writes the node table of `copy` to `out`.
//   out  - destination stream.
//   copy - node vector to dump.
// Layout in the visible lines: a "#"-prefixed version line, the node count, maxLevel, then for
// each node its level, name and child count followed by (child name, child index) pairs, all
// tab-separated. Any further per-node output is on lines not visible in this listing.
486 void PhyloTree::print(ofstream& out, vector<TaxNode>& copy){
489 //output mothur version
490 out << "#" << m->getVersion() << endl;
492 out << copy.size() << endl;
494 out << maxLevel << endl;
496 for (int i = 0; i < copy.size(); i++) {
498 out << copy[i].level << '\t'<< copy[i].name << '\t' << copy[i].children.size() << '\t';
// Children are emitted in the key order of the children map.
500 map<string,int>::iterator it;
501 for(it=copy[i].children.begin();it!=copy[i].children.end();it++){
502 out << it->first << '\t' << it->second << '\t';
509 catch(exception& e) {
510 m->errorOut(e, "PhyloTree", "print");
514 /**************************************************************************************************/
// Saves the tree to `treefilename` in the layout read back by the
// PhyloTree(ifstream&, string) constructor: a "#"-prefixed version line, the node count, one
// "name<TAB>level<TAB>parent" line per node, a blank line, the number of unique taxonomies,
// then one "nodeIndex<TAB>accessionCount" line per unique taxonomy.
// NOTE(review): the MPI rank is queried below; any guard that uses it is on lines not visible
// in this listing.
515 void PhyloTree::printTreeNodes(string treefilename) {
520 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
527 m->openOutputFile(treefilename, outTree);
529 //output mothur version
530 outTree << "#" << m->getVersion() << endl;
533 outTree << tree.size() << endl;
534 for (int i = 0; i < tree.size(); i++) {
535 outTree << tree[i].name << '\t' << tree[i].level << '\t' << tree[i].parent << endl;
// The leading endl produces the blank line that separates the node table from the genus table.
539 outTree << endl << uniqueTaxonomies.size() << endl;
540 map<int, int>::iterator it2;
541 for (it2=uniqueTaxonomies.begin(); it2!=uniqueTaxonomies.end(); it2++) { outTree << it2->first << '\t' << tree[it2->first].accessions.size() << endl; }
552 catch(exception& e) {
553 m->errorOut(e, "PhyloTree", "printTreeNodes");
557 /**************************************************************************************************/
// Returns a copy of node `i`. If `i` is not below tree.size(), prints the index and the tree
// size to stdout, logs a mismatch message and terminates the process with exit(1).
// NOTE(review): `i` is a signed int compared against tree.size(); if that is an unsigned
// size_type (as for std::vector), a negative `i` converts to a large value and takes the
// error branch.
558 TaxNode PhyloTree::get(int i ){
560 if (i < tree.size()) { return tree[i]; }
561 else { cout << i << '\t' << tree.size() << endl ; m->mothurOut("Mismatch with taxonomy and template files. Cannot continue."); m->mothurOutEndLine(); exit(1); }
563 catch(exception& e) {
564 m->errorOut(e, "PhyloTree", "get");
568 /**************************************************************************************************/
// Returns a copy of the node that name2Taxonomy records for `seqName`. If the name is unknown,
// logs a "Cannot find" message and terminates the process with exit(1).
569 TaxNode PhyloTree::get(string seqName){
571 map<string, int>::iterator itFind = name2Taxonomy.find(seqName);
// NOTE(review): name2Taxonomy[seqName] repeats the lookup already done by find();
// itFind->second holds the same index.
573 if (itFind != name2Taxonomy.end()) { return tree[name2Taxonomy[seqName]]; }
574 else { m->mothurOut("Cannot find " + seqName + ". Mismatch with taxonomy and template files. Cannot continue."); m->mothurOutEndLine(); exit(1);}
576 catch(exception& e) {
577 m->errorOut(e, "PhyloTree", "get");
581 /**************************************************************************************************/
// Returns the name of node `i`. If `i` is not below tree.size(), logs a mismatch message and
// terminates the process with exit(1).
582 string PhyloTree::getName(int i ){
584 if (i < tree.size()) { return tree[i].name; }
585 else { m->mothurOut("Mismatch with taxonomy and template files. Cannot continue."); m->mothurOutEndLine(); exit(1); }
// NOTE(review): the label passed below is "get" although this is getName, so errors raised
// here would be attributed to get().
587 catch(exception& e) {
588 m->errorOut(e, "PhyloTree", "get");
592 /**************************************************************************************************/
// Returns the node index that name2Taxonomy records for `seqName`. If the name is unknown,
// logs a "Cannot find" message and terminates the process with exit(1).
593 int PhyloTree::getIndex(string seqName){
595 map<string, int>::iterator itFind = name2Taxonomy.find(seqName);
// NOTE(review): name2Taxonomy[seqName] repeats the lookup already done by find();
// itFind->second holds the same value.
597 if (itFind != name2Taxonomy.end()) { return name2Taxonomy[seqName]; }
598 else { m->mothurOut("Cannot find " + seqName + ". Mismatch with taxonomy and template files. Cannot continue."); m->mothurOutEndLine(); exit(1);}
// NOTE(review): the label passed below is "get" although this is getIndex, so errors raised
// here would be attributed to get().
600 catch(exception& e) {
601 m->errorOut(e, "PhyloTree", "get");
605 /**************************************************************************************************/
// Cross-checks the sequence names of the template file against the names known to the tree and
// logs every name that appears on only one side.
//   templateFileNames - names from the template file; taken by value, so the caller's vector
//                       is untouched.
// NOTE(review): the boolean result is set and returned on lines not visible in this listing.
606 bool PhyloTree::ErrorCheck(vector<string> templateFileNames){
// The constructors register a placeholder sequence named "unknown"; adding it here keeps it
// from being reported as missing from the template.
610 templateFileNames.push_back("unknown");
612 map<string, int>::iterator itFind;
// Work on a copy so matched names can be erased without touching name2Taxonomy.
613 map<string, int> taxonomyFileNames = name2Taxonomy;
615 for (int i = 0; i < templateFileNames.size(); i++) {
616 itFind = taxonomyFileNames.find(templateFileNames[i]);
618 if (itFind != taxonomyFileNames.end()) { //found it so erase it
619 taxonomyFileNames.erase(itFind);
// Not found: the name exists only in the template file.
621 m->mothurOut(templateFileNames[i] + " is in your template file and is not in your taxonomy file. Please correct."); m->mothurOutEndLine();
625 //templateFileNames.erase(templateFileNames.begin()+i);
628 templateFileNames.clear();
// Whatever survived the erasures exists only in the taxonomy file.
630 if (taxonomyFileNames.size() > 0) { //there are names in tax file that are not in template
633 for (itFind = taxonomyFileNames.begin(); itFind != taxonomyFileNames.end(); itFind++) {
634 m->mothurOut(itFind->first + " is in your taxonomy file and is not in your template file. Please correct."); m->mothurOutEndLine();
640 catch(exception& e) {
641 m->errorOut(e, "PhyloTree", "ErrorCheck");
645 /**************************************************************************************************/