5 * Created by Pat Schloss on 6/17/09.
6 * Copyright 2009 Patrick D. Schloss. All rights reserved.
10 #include "phylotree.h"
12 /**************************************************************************************************/
14 PhyloTree::PhyloTree(){
16 m = MothurOut::getInstance();
19 tree.push_back(TaxNode("Root"));
20 tree[0].heirarchyID = "0";
25 m->errorOut(e, "PhyloTree", "PhyloTree");
29 /**************************************************************************************************/
31 PhyloTree::PhyloTree(ifstream& in, string filename){
33 m = MothurOut::getInstance();
43 char inFileName[1024];
44 strcpy(inFileName, filename.c_str());
46 MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI);
47 MPI_File_get_size(inMPI, &size);
49 char* buffer = new char[size];
50 MPI_File_read(inMPI, buffer, size, MPI_CHAR, &status);
52 string tempBuf = buffer;
53 if (tempBuf.length() > size) { tempBuf = tempBuf.substr(0, size); }
54 istringstream iss (tempBuf,istringstream::in);
57 iss >> numNodes; gobble(iss);
59 tree.resize(numNodes);
61 for (int i = 0; i < tree.size(); i++) {
62 iss >> tree[i].name >> tree[i].level >> tree[i].parent; gobble(iss);
67 iss >> numGenus; gobble(iss);
71 for (int i = 0; i < numGenus; i++) {
72 iss >> gnode >> gsize; gobble(iss);
74 uniqueTaxonomies[gnode] = gnode;
75 totals.push_back(gsize);
78 MPI_File_close(&inMPI);
81 in >> numNodes; gobble(in);
83 tree.resize(numNodes);
85 for (int i = 0; i < tree.size(); i++) {
86 in >> tree[i].name >> tree[i].level >> tree[i].parent; gobble(in);
91 in >> numGenus; gobble(in);
95 for (int i = 0; i < numGenus; i++) {
96 in >> gnode >> gsize; gobble(in);
98 uniqueTaxonomies[gnode] = gnode;
99 totals.push_back(gsize);
107 catch(exception& e) {
108 m->errorOut(e, "PhyloTree", "PhyloTree");
112 /**************************************************************************************************/
114 PhyloTree::PhyloTree(string tfile){
116 m = MothurOut::getInstance();
119 tree.push_back(TaxNode("Root"));
120 tree[0].heirarchyID = "0";
127 int pid, num, processors;
128 vector<long> positions;
132 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
133 MPI_Comm_size(MPI_COMM_WORLD, &processors);
135 char inFileName[1024];
136 strcpy(inFileName, tfile.c_str());
138 MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI); //comm, filename, mode, info, filepointer
141 positions = setFilePosEachLine(tfile, num);
143 //send file positions to all processes
144 for(int i = 1; i < processors; i++) {
145 MPI_Send(&num, 1, MPI_INT, i, 2001, MPI_COMM_WORLD);
146 MPI_Send(&positions[0], (num+1), MPI_LONG, i, 2001, MPI_COMM_WORLD);
149 MPI_Recv(&num, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status);
150 positions.resize(num+1);
151 MPI_Recv(&positions[0], (num+1), MPI_LONG, 0, 2001, MPI_COMM_WORLD, &status);
155 for(int i=0;i<num;i++){
157 int length = positions[i+1] - positions[i];
158 char* buf4 = new char[length];
160 MPI_File_read_at(inMPI, positions[i], buf4, length, MPI_CHAR, &status);
162 string tempBuf = buf4;
163 if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length); }
166 istringstream iss (tempBuf,istringstream::in);
168 addSeqToTree(name, tax);
171 MPI_File_close(&inMPI);
172 MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case
176 openInputFile(tfile, in);
178 //read in users taxonomy file and add sequences to tree
180 in >> name >> tax; gobble(in);
182 addSeqToTree(name, tax);
187 assignHeirarchyIDs(0);
189 //create file for summary if needed
192 catch(exception& e) {
193 m->errorOut(e, "PhyloTree", "PhyloTree");
198 /**************************************************************************************************/
200 string PhyloTree::getNextTaxon(string& heirarchy){
202 string currentLevel = "";
204 int pos = heirarchy.find_first_of(';');
205 currentLevel=heirarchy.substr(0,pos);
206 if (pos != (heirarchy.length()-1)) { heirarchy=heirarchy.substr(pos+1); }
207 else { heirarchy = ""; }
211 catch(exception& e) {
212 m->errorOut(e, "PhyloTree", "getNextTaxon");
217 /**************************************************************************************************/
219 int PhyloTree::addSeqToTree(string seqName, string seqTaxonomy){
224 map<string, int>::iterator childPointer;
229 tree[0].accessions.push_back(seqName);
230 string taxon;// = getNextTaxon(seqTaxonomy);
232 while(seqTaxonomy != ""){
236 if (m->control_pressed) { return 0; }
238 //somehow the parent is getting one too many accnos
239 //use print to reassign the taxa id
240 taxon = getNextTaxon(seqTaxonomy);
242 if (taxon == "") { m->mothurOut(seqName + " has an error in the taxonomy. This may be due to a ;;"); m->mothurOutEndLine(); if (currentNode != 0) { uniqueTaxonomies[currentNode] = currentNode; } break; }
244 childPointer = tree[currentNode].children.find(taxon);
246 if(childPointer != tree[currentNode].children.end()){ //if the node already exists, move on
247 currentNode = childPointer->second;
248 tree[currentNode].accessions.push_back(seqName);
249 name2Taxonomy[seqName] = currentNode;
251 else{ //otherwise, create it
252 tree.push_back(TaxNode(taxon));
254 tree[currentNode].children[taxon] = numNodes-1;
255 tree[numNodes-1].parent = currentNode;
257 // int numChildren = tree[currentNode].children.size();
258 // string heirarchyID = tree[currentNode].heirarchyID;
259 // tree[currentNode].accessions.push_back(seqName);
261 currentNode = tree[currentNode].children[taxon];
262 tree[currentNode].accessions.push_back(seqName);
263 name2Taxonomy[seqName] = currentNode;
264 // tree[currentNode].level = level;
265 // tree[currentNode].childNumber = numChildren;
266 // tree[currentNode].heirarchyID = heirarchyID + '.' + toString(tree[currentNode].childNumber);
269 if (seqTaxonomy == "") { uniqueTaxonomies[currentNode] = currentNode; }
273 catch(exception& e) {
274 m->errorOut(e, "PhyloTree", "addSeqToTree");
278 /**************************************************************************************************/
279 vector<int> PhyloTree::getGenusNodes() {
282 //generate genusIndexes
283 map<int, int>::iterator it2;
284 for (it2=uniqueTaxonomies.begin(); it2!=uniqueTaxonomies.end(); it2++) { genusIndex.push_back(it2->first); }
288 catch(exception& e) {
289 m->errorOut(e, "PhyloTree", "getGenusNodes");
293 /**************************************************************************************************/
294 vector<int> PhyloTree::getGenusTotals() {
299 //reset counts because we are on a new word
300 for (int j = 0; j < genusIndex.size(); j++) {
301 totals.push_back(tree[genusIndex[j]].accessions.size());
309 catch(exception& e) {
310 m->errorOut(e, "PhyloTree", "getGenusNodes");
314 /**************************************************************************************************/
316 void PhyloTree::assignHeirarchyIDs(int index){
318 map<string,int>::iterator it;
321 for(it=tree[index].children.begin();it!=tree[index].children.end();it++){
322 tree[it->second].heirarchyID = tree[index].heirarchyID + '.' + toString(counter);
324 tree[it->second].level = tree[index].level + 1;
326 //save maxLevel for binning the unclassified seqs
327 if (tree[it->second].level > maxLevel) { maxLevel = tree[it->second].level; }
329 assignHeirarchyIDs(it->second);
332 catch(exception& e) {
333 m->errorOut(e, "PhyloTree", "assignHeirarchyIDs");
337 /**************************************************************************************************/
338 void PhyloTree::setUp(string tfile){
340 string taxFileNameTest = tfile.substr(0,tfile.find_last_of(".")+1) + "tree.sum";
344 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
346 if (pid == 0) { binUnclassified(taxFileNameTest); }
349 //create file needed for summary if it doesn't exist
350 ifstream FileTest(taxFileNameTest.c_str());
353 binUnclassified(taxFileNameTest);
357 catch(exception& e) {
358 m->errorOut(e, "PhyloTree", "setUp");
362 /**************************************************************************************************/
363 void PhyloTree::binUnclassified(string file){
367 openOutputFile(file, out);
369 map<string, int>::iterator itBin;
370 map<string, int>::iterator childPointer;
372 vector<TaxNode> copy = tree;
375 fillOutTree(0, copy);
377 //get leaf nodes that may need externsion
378 for (int i = 0; i < copy.size(); i++) {
380 if (copy[i].children.size() == 0) {
385 int copyNodes = copy.size();
387 //go through the seqs and if a sequence finest taxon is not the same level as the most finely defined taxon then classify it as unclassified where necessary
388 map<int, int>::iterator itLeaf;
389 for (itLeaf = leafNodes.begin(); itLeaf != leafNodes.end(); itLeaf++) {
391 if (m->control_pressed) { out.close(); break; }
393 int level = copy[itLeaf->second].level;
394 int currentNode = itLeaf->second;
396 //this sequence is unclassified at some levels
397 while(level <= maxLevel){
401 string taxon = "unclassified";
403 //does the parent have a child names 'unclassified'?
404 childPointer = copy[currentNode].children.find(taxon);
406 if(childPointer != copy[currentNode].children.end()){ //if the node already exists, move on
407 currentNode = childPointer->second; //currentNode becomes 'unclassified'
409 else{ //otherwise, create it
410 copy.push_back(TaxNode(taxon));
412 copy[currentNode].children[taxon] = copyNodes-1;
413 copy[copyNodes-1].parent = currentNode;
414 copy[copyNodes-1].level = copy[currentNode].level + 1;
416 currentNode = copy[currentNode].children[taxon];
421 if (!m->control_pressed) {
427 catch(exception& e) {
428 m->errorOut(e, "PhyloTree", "binUnclassified");
432 /**************************************************************************************************/
433 void PhyloTree::fillOutTree(int index, vector<TaxNode>& copy) {
435 map<string,int>::iterator it;
437 it = copy[index].children.find("unclassified");
438 if (it == copy[index].children.end()) { //no unclassified at this level
439 string taxon = "unclassified";
440 copy.push_back(TaxNode(taxon));
441 copy[index].children[taxon] = copy.size()-1;
442 copy[copy.size()-1].parent = index;
443 copy[copy.size()-1].level = copy[index].level + 1;
446 if (tree[index].level <= maxLevel) {
447 for(it=tree[index].children.begin();it!=tree[index].children.end();it++){ //check your children
448 fillOutTree(it->second, copy);
453 catch(exception& e) {
454 m->errorOut(e, "PhyloTree", "fillOutTree");
458 /**************************************************************************************************/
459 string PhyloTree::getFullTaxonomy(string seqName) {
463 int currentNode = name2Taxonomy[seqName];
465 while (tree[currentNode].parent != -1) {
466 tax = tree[currentNode].name + ";" + tax;
467 currentNode = tree[currentNode].parent;
472 catch(exception& e) {
473 m->errorOut(e, "PhyloTree", "getFullTaxonomy");
477 /**************************************************************************************************/
479 void PhyloTree::print(ofstream& out, vector<TaxNode>& copy){
481 out << copy.size() << endl;
483 for (int i = 0; i < copy.size(); i++) {
485 out << copy[i].level << '\t'<< copy[i].name << '\t' << copy[i].children.size() << '\t';
487 map<string,int>::iterator it;
488 for(it=copy[i].children.begin();it!=copy[i].children.end();it++){
489 out << it->first << '\t' << it->second << '\t';
496 catch(exception& e) {
497 m->errorOut(e, "PhyloTree", "print");
501 /**************************************************************************************************/
502 void PhyloTree::printTreeNodes(string treefilename) {
507 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
514 openOutputFile(treefilename, outTree);
517 outTree << tree.size() << endl;
518 for (int i = 0; i < tree.size(); i++) {
519 outTree << tree[i].name << '\t' << tree[i].level << '\t' << tree[i].parent << endl;
523 outTree << endl << uniqueTaxonomies.size() << endl;
524 map<int, int>::iterator it2;
525 for (it2=uniqueTaxonomies.begin(); it2!=uniqueTaxonomies.end(); it2++) { outTree << it2->first << '\t' << tree[it2->first].accessions.size() << endl; }
536 catch(exception& e) {
537 m->errorOut(e, "PhyloTree", "printTreeNodes");
541 /**************************************************************************************************/
542 TaxNode PhyloTree::get(int i ){
544 if (i < tree.size()) { return tree[i]; }
545 else { cout << i << '\t' << tree.size() << endl ; m->mothurOut("Mismatch with taxonomy and template files. Cannot continue."); m->mothurOutEndLine(); exit(1); }
547 catch(exception& e) {
548 m->errorOut(e, "PhyloTree", "get");
552 /**************************************************************************************************/
553 TaxNode PhyloTree::get(string seqName){
555 map<string, int>::iterator itFind = name2Taxonomy.find(seqName);
557 if (itFind != name2Taxonomy.end()) { return tree[name2Taxonomy[seqName]]; }
558 else { m->mothurOut("Cannot find " + seqName + ". Mismatch with taxonomy and template files. Cannot continue."); m->mothurOutEndLine(); exit(1);}
560 catch(exception& e) {
561 m->errorOut(e, "PhyloTree", "get");
565 /**************************************************************************************************/
566 string PhyloTree::getName(int i ){
568 if (i < tree.size()) { return tree[i].name; }
569 else { m->mothurOut("Mismatch with taxonomy and template files. Cannot continue."); m->mothurOutEndLine(); exit(1); }
571 catch(exception& e) {
572 m->errorOut(e, "PhyloTree", "get");
576 /**************************************************************************************************/
577 int PhyloTree::getIndex(string seqName){
579 map<string, int>::iterator itFind = name2Taxonomy.find(seqName);
581 if (itFind != name2Taxonomy.end()) { return name2Taxonomy[seqName]; }
582 else { m->mothurOut("Cannot find " + seqName + ". Mismatch with taxonomy and template files. Cannot continue."); m->mothurOutEndLine(); exit(1);}
584 catch(exception& e) {
585 m->errorOut(e, "PhyloTree", "get");
589 /**************************************************************************************************/
590 bool PhyloTree::ErrorCheck(vector<string> templateFileNames){
595 map<string, int>::iterator itFind;
596 map<string, int> taxonomyFileNames = name2Taxonomy;
598 for (int i = 0; i < templateFileNames.size(); i++) {
599 itFind = taxonomyFileNames.find(templateFileNames[i]);
601 if (itFind != name2Taxonomy.end()) { //found it so erase it
602 taxonomyFileNames.erase(itFind);
604 m->mothurOut(templateFileNames[i] + " is in your template file and is not in your taxonomy file. Please correct."); m->mothurOutEndLine();
608 templateFileNames.erase(templateFileNames.begin()+i);
612 if (taxonomyFileNames.size() > 0) { //there are names in tax file that are not in template
615 for (itFind = taxonomyFileNames.begin(); itFind != taxonomyFileNames.end(); itFind++) {
616 m->mothurOut(itFind->first + " is in your taxonomy file and is not in your template file. Please correct."); m->mothurOutEndLine();
622 catch(exception& e) {
623 m->errorOut(e, "PhyloTree", "ErrorCheck");
627 /**************************************************************************************************/