5 * Created by Pat Schloss on 6/17/09.
6 * Copyright 2009 Patrick D. Schloss. All rights reserved.
10 #include "phylotree.h"
12 /**************************************************************************************************/
14 PhyloTree::PhyloTree(){
16 m = MothurOut::getInstance();
19 tree.push_back(TaxNode("Root"));
20 tree[0].heirarchyID = "0";
23 addSeqToTree("unknown", "unknown;");
26 m->errorOut(e, "PhyloTree", "PhyloTree");
30 /**************************************************************************************************/
32 PhyloTree::PhyloTree(ifstream& in, string filename){
34 m = MothurOut::getInstance();
44 char inFileName[1024];
45 strcpy(inFileName, filename.c_str());
47 MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI);
48 MPI_File_get_size(inMPI, &size);
50 char* buffer = new char[size];
51 MPI_File_read(inMPI, buffer, size, MPI_CHAR, &status);
53 string tempBuf = buffer;
54 if (tempBuf.length() > size) { tempBuf = tempBuf.substr(0, size); }
55 istringstream iss (tempBuf,istringstream::in);
59 m->getline(iss); m->gobble(iss);
61 iss >> numNodes; m->gobble(iss);
63 tree.resize(numNodes);
65 for (int i = 0; i < tree.size(); i++) {
66 iss >> tree[i].name >> tree[i].level >> tree[i].parent; m->gobble(iss);
71 iss >> numGenus; m->gobble(iss);
75 for (int i = 0; i < numGenus; i++) {
76 iss >> gnode >> gsize; m->gobble(iss);
78 uniqueTaxonomies.insert(gnode);
79 totals.push_back(gsize);
82 MPI_File_close(&inMPI);
86 string line = m->getline(in); m->gobble(in);
88 in >> numNodes; m->gobble(in);
90 tree.resize(numNodes);
92 for (int i = 0; i < tree.size(); i++) {
93 in >> tree[i].name >> tree[i].level >> tree[i].parent; m->gobble(in);
98 in >> numGenus; m->gobble(in);
102 for (int i = 0; i < numGenus; i++) {
103 in >> gnode >> gsize; m->gobble(in);
105 uniqueTaxonomies.insert(gnode);
106 totals.push_back(gsize);
114 catch(exception& e) {
115 m->errorOut(e, "PhyloTree", "PhyloTree");
119 /**************************************************************************************************/
121 PhyloTree::PhyloTree(string tfile){
123 m = MothurOut::getInstance();
126 tree.push_back(TaxNode("Root"));
127 tree[0].heirarchyID = "0";
133 int pid, num, processors;
134 vector<unsigned long long> positions;
138 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
139 MPI_Comm_size(MPI_COMM_WORLD, &processors);
141 char inFileName[1024];
142 strcpy(inFileName, tfile.c_str());
144 MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI); //comm, filename, mode, info, filepointer
147 positions = m->setFilePosEachLine(tfile, num);
149 //send file positions to all processes
150 for(int i = 1; i < processors; i++) {
151 MPI_Send(&num, 1, MPI_INT, i, 2001, MPI_COMM_WORLD);
152 MPI_Send(&positions[0], (num+1), MPI_LONG, i, 2001, MPI_COMM_WORLD);
155 MPI_Recv(&num, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status);
156 positions.resize(num+1);
157 MPI_Recv(&positions[0], (num+1), MPI_LONG, 0, 2001, MPI_COMM_WORLD, &status);
161 for(int i=0;i<num;i++){
163 int length = positions[i+1] - positions[i];
164 char* buf4 = new char[length];
166 MPI_File_read_at(inMPI, positions[i], buf4, length, MPI_CHAR, &status);
168 string tempBuf = buf4;
169 if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length); }
172 istringstream iss (tempBuf,istringstream::in);
174 addSeqToTree(name, tax);
177 MPI_File_close(&inMPI);
178 MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case
181 map<string, string> temp;
182 m->readTax(tfile, temp);
184 for (map<string, string>::iterator itTemp = temp.begin(); itTemp != temp.end();) {
185 addSeqToTree(itTemp->first, itTemp->second);
186 temp.erase(itTemp++);
190 assignHeirarchyIDs(0);
193 string unknownTax = "unknown;";
194 //added last taxon until you get desired level
195 for (int i = 1; i < maxLevel; i++) {
196 unknownTax += "unclassfied;";
199 addSeqToTree("unknown", unknownTax);
201 //create file for summary if needed
204 catch(exception& e) {
205 m->errorOut(e, "PhyloTree", "PhyloTree");
210 /**************************************************************************************************/
212 string PhyloTree::getNextTaxon(string& heirarchy, string seqname){
214 string currentLevel = "";
216 int pos = heirarchy.find_first_of(';');
218 if (pos == -1) { //you can't find another ;
219 currentLevel = heirarchy;
221 m->mothurOut(seqname + " is missing a ;, please check for other errors."); m->mothurOutEndLine();
223 currentLevel=heirarchy.substr(0,pos);
224 if (pos != (heirarchy.length()-1)) { heirarchy=heirarchy.substr(pos+1); }
225 else { heirarchy = ""; }
231 catch(exception& e) {
232 m->errorOut(e, "PhyloTree", "getNextTaxon");
236 /**************************************************************************************************/
238 vector<string> PhyloTree::getSeqs(string seqTaxonomy){
240 string taxCopy = seqTaxonomy;
241 vector<string> names;
242 map<string, int>::iterator childPointer;
246 m->removeConfidences(seqTaxonomy);
249 while(seqTaxonomy != ""){
251 if (m->control_pressed) { return names; }
253 taxon = getNextTaxon(seqTaxonomy, "");
255 if (m->debug) { m->mothurOut(taxon +'\n'); }
257 if (taxon == "") { m->mothurOut(taxCopy + " has an error in the taxonomy. This may be due to a ;;"); m->mothurOutEndLine(); break; }
259 childPointer = tree[currentNode].children.find(taxon);
261 if(childPointer != tree[currentNode].children.end()){ //if the node already exists, move on
262 currentNode = childPointer->second;
264 else{ //otherwise, error this taxonomy is not in tree
265 m->mothurOut("[ERROR]: " + taxCopy + " is not in taxonomy tree, please correct."); m->mothurOutEndLine(); m->control_pressed = true; return names;
268 if (seqTaxonomy == "") { names = tree[currentNode].accessions; }
273 catch(exception& e) {
274 m->errorOut(e, "PhyloTree", "getSeqs");
279 /**************************************************************************************************/
281 int PhyloTree::addSeqToTree(string seqName, string seqTaxonomy){
285 map<string, int>::iterator childPointer;
290 tree[0].accessions.push_back(seqName);
291 m->removeConfidences(seqTaxonomy);
293 string taxon;// = getNextTaxon(seqTaxonomy);
295 while(seqTaxonomy != ""){
299 if (m->control_pressed) { return 0; }
301 //somehow the parent is getting one too many accnos
302 //use print to reassign the taxa id
303 taxon = getNextTaxon(seqTaxonomy, seqName);
305 if (m->debug) { m->mothurOut(seqName +'\t' + taxon +'\n'); }
307 if (taxon == "") { m->mothurOut(seqName + " has an error in the taxonomy. This may be due to a ;;"); m->mothurOutEndLine(); if (currentNode != 0) { uniqueTaxonomies.insert(currentNode); } break; }
309 childPointer = tree[currentNode].children.find(taxon);
311 if(childPointer != tree[currentNode].children.end()){ //if the node already exists, move on
312 currentNode = childPointer->second;
313 tree[currentNode].accessions.push_back(seqName);
314 name2Taxonomy[seqName] = currentNode;
316 else{ //otherwise, create it
317 tree.push_back(TaxNode(taxon));
319 tree[currentNode].children[taxon] = numNodes-1;
320 tree[numNodes-1].parent = currentNode;
322 currentNode = tree[currentNode].children[taxon];
323 tree[currentNode].accessions.push_back(seqName);
324 name2Taxonomy[seqName] = currentNode;
327 if (seqTaxonomy == "") { uniqueTaxonomies.insert(currentNode); }
332 catch(exception& e) {
333 m->errorOut(e, "PhyloTree", "addSeqToTree");
337 /**************************************************************************************************/
338 vector<int> PhyloTree::getGenusNodes() {
341 //generate genusIndexes
342 set<int>::iterator it2;
344 for (it2=uniqueTaxonomies.begin(); it2!=uniqueTaxonomies.end(); it2++) { genusIndex.push_back(*it2); temp[*it2] = genusIndex.size()-1; }
346 for (map<string, int>::iterator itName = name2Taxonomy.begin(); itName != name2Taxonomy.end(); itName++) {
347 map<int, int>::iterator itTemp = temp.find(itName->second);
348 if (itTemp != temp.end()) { name2GenusNodeIndex[itName->first] = itTemp->second; }
349 else { m->mothurOut("[ERROR]: trouble making name2GenusNodeIndex, aborting.\n"); m->control_pressed = true; }
354 catch(exception& e) {
355 m->errorOut(e, "PhyloTree", "getGenusNodes");
359 /**************************************************************************************************/
360 vector<int> PhyloTree::getGenusTotals() {
365 //reset counts because we are on a new word
366 for (int j = 0; j < genusIndex.size(); j++) {
367 totals.push_back(tree[genusIndex[j]].accessions.size());
375 catch(exception& e) {
376 m->errorOut(e, "PhyloTree", "getGenusNodes");
380 /**************************************************************************************************/
382 void PhyloTree::assignHeirarchyIDs(int index){
384 map<string,int>::iterator it;
387 for(it=tree[index].children.begin();it!=tree[index].children.end();it++){
389 if (m->debug) { m->mothurOut(toString(index) +'\t' + tree[it->second].name +'\n'); }
391 tree[it->second].heirarchyID = tree[index].heirarchyID + '.' + toString(counter);
393 tree[it->second].level = tree[index].level + 1;
395 //save maxLevel for binning the unclassified seqs
396 if (tree[it->second].level > maxLevel) { maxLevel = tree[it->second].level; }
398 assignHeirarchyIDs(it->second);
401 catch(exception& e) {
402 m->errorOut(e, "PhyloTree", "assignHeirarchyIDs");
406 /**************************************************************************************************/
407 void PhyloTree::setUp(string tfile){
409 string taxFileNameTest = tfile.substr(0,tfile.find_last_of(".")+1) + "tree.sum";
413 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
415 if (pid == 0) { binUnclassified(taxFileNameTest); }
418 binUnclassified(taxFileNameTest);
421 catch(exception& e) {
422 m->errorOut(e, "PhyloTree", "setUp");
426 /**************************************************************************************************/
427 void PhyloTree::binUnclassified(string file){
431 m->openOutputFile(file, out);
433 map<string, int>::iterator itBin;
434 map<string, int>::iterator childPointer;
436 vector<TaxNode> copy = tree;
439 fillOutTree(0, copy);
441 //get leaf nodes that may need extension
442 for (int i = 0; i < copy.size(); i++) {
444 if (copy[i].children.size() == 0) {
449 if (m->debug) { m->mothurOut("maxLevel = " + toString(maxLevel) +'\n'); }
451 int copyNodes = copy.size();
453 //go through the seqs and if a sequence finest taxon is not the same level as the most finely defined taxon then classify it as unclassified where necessary
454 map<int, int>::iterator itLeaf;
455 for (itLeaf = leafNodes.begin(); itLeaf != leafNodes.end(); itLeaf++) {
457 if (m->control_pressed) { out.close(); break; }
459 int level = copy[itLeaf->second].level;
460 int currentNode = itLeaf->second;
462 if (m->debug) { m->mothurOut(copy[currentNode].name +'\n'); }
464 //this sequence is unclassified at some levels
465 while(level < maxLevel){
468 if (m->debug) { m->mothurOut("level = " + toString(level) +'\n'); }
470 string taxon = "unclassified";
472 //does the parent have a child names 'unclassified'?
473 childPointer = copy[currentNode].children.find(taxon);
475 if(childPointer != copy[currentNode].children.end()){ //if the node already exists, move on
476 currentNode = childPointer->second; //currentNode becomes 'unclassified'
478 else{ //otherwise, create it
479 copy.push_back(TaxNode(taxon));
481 copy[currentNode].children[taxon] = copyNodes-1;
482 copy[copyNodes-1].parent = currentNode;
483 copy[copyNodes-1].level = copy[currentNode].level + 1;
485 currentNode = copy[currentNode].children[taxon];
490 if (!m->control_pressed) {
496 catch(exception& e) {
497 m->errorOut(e, "PhyloTree", "binUnclassified");
501 /**************************************************************************************************/
502 void PhyloTree::fillOutTree(int index, vector<TaxNode>& copy) {
505 map<string,int>::iterator it;
507 it = copy[index].children.find("unclassified");
508 if (it == copy[index].children.end()) { //no unclassified at this level
509 string taxon = "unclassified";
510 copy.push_back(TaxNode(taxon));
511 copy[index].children[taxon] = copy.size()-1;
512 copy[copy.size()-1].parent = index;
513 copy[copy.size()-1].level = copy[index].level + 1;
516 if (tree[index].level < maxLevel) {
517 for(it=tree[index].children.begin();it!=tree[index].children.end();it++){ //check your children
518 fillOutTree(it->second, copy);
523 catch(exception& e) {
524 m->errorOut(e, "PhyloTree", "fillOutTree");
528 /**************************************************************************************************/
529 string PhyloTree::getFullTaxonomy(string seqName) {
533 int currentNode = name2Taxonomy[seqName];
535 while (tree[currentNode].parent != -1) {
536 tax = tree[currentNode].name + ";" + tax;
537 currentNode = tree[currentNode].parent;
542 catch(exception& e) {
543 m->errorOut(e, "PhyloTree", "getFullTaxonomy");
547 /**************************************************************************************************/
549 void PhyloTree::print(ofstream& out, vector<TaxNode>& copy){
552 //output mothur version
553 out << "#" << m->getVersion() << endl;
555 out << copy.size() << endl;
557 out << maxLevel << endl;
559 for (int i = 0; i < copy.size(); i++) {
561 out << copy[i].level << '\t'<< copy[i].name << '\t' << copy[i].children.size() << '\t';
563 map<string,int>::iterator it;
564 for(it=copy[i].children.begin();it!=copy[i].children.end();it++){
565 out << it->first << '\t' << it->second << '\t';
572 catch(exception& e) {
573 m->errorOut(e, "PhyloTree", "print");
577 /**************************************************************************************************/
578 void PhyloTree::printTreeNodes(string treefilename) {
583 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
590 m->openOutputFile(treefilename, outTree);
592 //output mothur version
593 outTree << "#" << m->getVersion() << endl;
596 outTree << tree.size() << endl;
597 for (int i = 0; i < tree.size(); i++) {
598 outTree << tree[i].name << '\t' << tree[i].level << '\t' << tree[i].parent << endl;
602 outTree << endl << uniqueTaxonomies.size() << endl;
603 set<int>::iterator it2;
604 for (it2=uniqueTaxonomies.begin(); it2!=uniqueTaxonomies.end(); it2++) { outTree << *it2 << '\t' << tree[*it2].accessions.size() << endl; }
615 catch(exception& e) {
616 m->errorOut(e, "PhyloTree", "printTreeNodes");
620 /**************************************************************************************************/
621 TaxNode PhyloTree::get(int i ){
623 if (i < tree.size()) { return tree[i]; }
624 else { cout << i << '\t' << tree.size() << endl ; m->mothurOut("Mismatch with taxonomy and template files. Cannot continue."); m->mothurOutEndLine(); exit(1); }
626 catch(exception& e) {
627 m->errorOut(e, "PhyloTree", "get");
631 /**************************************************************************************************/
632 TaxNode PhyloTree::get(string seqName){
634 map<string, int>::iterator itFind = name2Taxonomy.find(seqName);
636 if (itFind != name2Taxonomy.end()) { return tree[name2Taxonomy[seqName]]; }
637 else { m->mothurOut("Cannot find " + seqName + ". Mismatch with taxonomy and template files. Cannot continue."); m->mothurOutEndLine(); exit(1);}
639 catch(exception& e) {
640 m->errorOut(e, "PhyloTree", "get");
644 /**************************************************************************************************/
645 string PhyloTree::getName(int i ){
647 if (i < tree.size()) { return tree[i].name; }
648 else { m->mothurOut("Mismatch with taxonomy and template files. Cannot continue."); m->mothurOutEndLine(); exit(1); }
650 catch(exception& e) {
651 m->errorOut(e, "PhyloTree", "get");
655 /**************************************************************************************************/
656 int PhyloTree::getGenusIndex(string seqName){
658 map<string, int>::iterator itFind = name2GenusNodeIndex.find(seqName);
660 if (itFind != name2GenusNodeIndex.end()) { return itFind->second; }
661 else { m->mothurOut("Cannot find " + seqName + ". Could be a mismatch with taxonomy and template files. Cannot continue."); m->mothurOutEndLine(); exit(1);}
663 catch(exception& e) {
664 m->errorOut(e, "PhyloTree", "get");
668 /**************************************************************************************************/
669 bool PhyloTree::ErrorCheck(vector<string> templateFileNames){
673 templateFileNames.push_back("unknown");
675 map<string, int>::iterator itFind;
676 map<string, int> taxonomyFileNames = name2Taxonomy;
678 if (m->debug) { m->mothurOut("[DEBUG]: in error check. Numseqs in template = " + toString(templateFileNames.size()) + ". Numseqs in taxonomy = " + toString(taxonomyFileNames.size()) + ".\n"); }
680 for (int i = 0; i < templateFileNames.size(); i++) {
681 itFind = taxonomyFileNames.find(templateFileNames[i]);
683 if (itFind != taxonomyFileNames.end()) { //found it so erase it
684 taxonomyFileNames.erase(itFind);
686 m->mothurOut("'" +templateFileNames[i] + "' is in your template file and is not in your taxonomy file. Please correct."); m->mothurOutEndLine();
690 //templateFileNames.erase(templateFileNames.begin()+i);
693 templateFileNames.clear();
695 if (taxonomyFileNames.size() > 0) { //there are names in tax file that are not in template
698 for (itFind = taxonomyFileNames.begin(); itFind != taxonomyFileNames.end(); itFind++) {
699 m->mothurOut(itFind->first + " is in your taxonomy file and is not in your template file. Please correct."); m->mothurOutEndLine();
705 catch(exception& e) {
706 m->errorOut(e, "PhyloTree", "ErrorCheck");
710 /**************************************************************************************************/