5 * Created by Pat Schloss on 6/17/09.
6 * Copyright 2009 Patrick D. Schloss. All rights reserved.
10 #include "phylotree.h"
12 /**************************************************************************************************/
14 PhyloTree::PhyloTree(){
16 m = MothurOut::getInstance();
19 tree.push_back(TaxNode("Root"));
20 tree[0].heirarchyID = "0";
25 m->errorOut(e, "PhyloTree", "PhyloTree");
29 /**************************************************************************************************/
31 PhyloTree::PhyloTree(ifstream& in, string filename){
33 m = MothurOut::getInstance();
41 char inFileName[1024];
42 strcpy(inFileName, filename.c_str());
44 MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI);
45 MPI_File_get_size(inMPI, &size);
47 char* buffer = new char[size];
48 MPI_File_read(inMPI, buffer, size, MPI_CHAR, &status);
50 string tempBuf = buffer;
51 if (tempBuf.length() > size) { tempBuf = tempBuf.substr(0, size); }
52 istringstream iss (tempBuf,istringstream::in);
55 iss >> numNodes; gobble(iss);
57 tree.resize(numNodes);
59 for (int i = 0; i < tree.size(); i++) {
60 iss >> tree[i].name >> tree[i].level >> tree[i].parent; gobble(iss);
65 iss >> numGenus; gobble(iss);
69 for (int i = 0; i < numGenus; i++) {
70 iss >> gnode >> gsize; gobble(iss);
72 uniqueTaxonomies[gnode] = gnode;
73 totals.push_back(gsize);
76 MPI_File_close(&inMPI);
79 in >> numNodes; gobble(in);
81 tree.resize(numNodes);
83 for (int i = 0; i < tree.size(); i++) {
84 in >> tree[i].name >> tree[i].level >> tree[i].parent; gobble(in);
89 in >> numGenus; gobble(in);
93 for (int i = 0; i < numGenus; i++) {
94 in >> gnode >> gsize; gobble(in);
96 uniqueTaxonomies[gnode] = gnode;
97 totals.push_back(gsize);
105 catch(exception& e) {
106 m->errorOut(e, "PhyloTree", "PhyloTree");
110 /**************************************************************************************************/
112 PhyloTree::PhyloTree(string tfile){
114 m = MothurOut::getInstance();
117 tree.push_back(TaxNode("Root"));
118 tree[0].heirarchyID = "0";
125 int pid, num, processors;
126 vector<long> positions;
130 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
131 MPI_Comm_size(MPI_COMM_WORLD, &processors);
133 char inFileName[1024];
134 strcpy(inFileName, tfile.c_str());
136 MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI); //comm, filename, mode, info, filepointer
139 positions = setFilePosEachLine(tfile, num);
141 //send file positions to all processes
142 for(int i = 1; i < processors; i++) {
143 MPI_Send(&num, 1, MPI_INT, i, 2001, MPI_COMM_WORLD);
144 MPI_Send(&positions[0], (num+1), MPI_LONG, i, 2001, MPI_COMM_WORLD);
147 MPI_Recv(&num, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status);
148 positions.resize(num+1);
149 MPI_Recv(&positions[0], (num+1), MPI_LONG, 0, 2001, MPI_COMM_WORLD, &status);
153 for(int i=0;i<num;i++){
155 int length = positions[i+1] - positions[i];
156 char* buf4 = new char[length];
158 MPI_File_read_at(inMPI, positions[i], buf4, length, MPI_CHAR, &status);
160 string tempBuf = buf4;
161 if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length); }
164 istringstream iss (tempBuf,istringstream::in);
166 addSeqToTree(name, tax);
169 MPI_File_close(&inMPI);
170 MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case
174 openInputFile(tfile, in);
176 //read in users taxonomy file and add sequences to tree
178 in >> name >> tax; gobble(in);
180 addSeqToTree(name, tax);
185 assignHeirarchyIDs(0);
187 //create file for summary if needed
190 catch(exception& e) {
191 m->errorOut(e, "PhyloTree", "PhyloTree");
196 /**************************************************************************************************/
198 string PhyloTree::getNextTaxon(string& heirarchy){
200 string currentLevel = "";
202 int pos = heirarchy.find_first_of(';');
203 currentLevel=heirarchy.substr(0,pos);
204 if (pos != (heirarchy.length()-1)) { heirarchy=heirarchy.substr(pos+1); }
205 else { heirarchy = ""; }
209 catch(exception& e) {
210 m->errorOut(e, "PhyloTree", "getNextTaxon");
215 /**************************************************************************************************/
217 int PhyloTree::addSeqToTree(string seqName, string seqTaxonomy){
221 map<string, int>::iterator childPointer;
226 tree[0].accessions.push_back(seqName);
227 string taxon;// = getNextTaxon(seqTaxonomy);
229 while(seqTaxonomy != ""){
233 if (m->control_pressed) { return 0; }
235 //somehow the parent is getting one too many accnos
236 //use print to reassign the taxa id
237 taxon = getNextTaxon(seqTaxonomy);
239 childPointer = tree[currentNode].children.find(taxon);
241 if(childPointer != tree[currentNode].children.end()){ //if the node already exists, move on
242 currentNode = childPointer->second;
243 tree[currentNode].accessions.push_back(seqName);
244 name2Taxonomy[seqName] = currentNode;
246 else{ //otherwise, create it
247 tree.push_back(TaxNode(taxon));
249 tree[currentNode].children[taxon] = numNodes-1;
250 tree[numNodes-1].parent = currentNode;
252 // int numChildren = tree[currentNode].children.size();
253 // string heirarchyID = tree[currentNode].heirarchyID;
254 // tree[currentNode].accessions.push_back(seqName);
256 currentNode = tree[currentNode].children[taxon];
257 tree[currentNode].accessions.push_back(seqName);
258 name2Taxonomy[seqName] = currentNode;
259 // tree[currentNode].level = level;
260 // tree[currentNode].childNumber = numChildren;
261 // tree[currentNode].heirarchyID = heirarchyID + '.' + toString(tree[currentNode].childNumber);
264 if (seqTaxonomy == "") { uniqueTaxonomies[currentNode] = currentNode; }
268 catch(exception& e) {
269 m->errorOut(e, "PhyloTree", "addSeqToTree");
273 /**************************************************************************************************/
274 vector<int> PhyloTree::getGenusNodes() {
277 //generate genusIndexes
278 map<int, int>::iterator it2;
279 for (it2=uniqueTaxonomies.begin(); it2!=uniqueTaxonomies.end(); it2++) { genusIndex.push_back(it2->first); }
283 catch(exception& e) {
284 m->errorOut(e, "PhyloTree", "getGenusNodes");
288 /**************************************************************************************************/
289 vector<int> PhyloTree::getGenusTotals() {
294 //reset counts because we are on a new word
295 for (int j = 0; j < genusIndex.size(); j++) {
296 totals.push_back(tree[genusIndex[j]].accessions.size());
304 catch(exception& e) {
305 m->errorOut(e, "PhyloTree", "getGenusNodes");
309 /**************************************************************************************************/
311 void PhyloTree::assignHeirarchyIDs(int index){
313 map<string,int>::iterator it;
316 for(it=tree[index].children.begin();it!=tree[index].children.end();it++){
317 tree[it->second].heirarchyID = tree[index].heirarchyID + '.' + toString(counter);
319 tree[it->second].level = tree[index].level + 1;
321 //save maxLevel for binning the unclassified seqs
322 if (tree[it->second].level > maxLevel) { maxLevel = tree[it->second].level; }
324 assignHeirarchyIDs(it->second);
327 catch(exception& e) {
328 m->errorOut(e, "PhyloTree", "assignHeirarchyIDs");
332 /**************************************************************************************************/
333 void PhyloTree::setUp(string tfile){
335 string taxFileNameTest = tfile.substr(0,tfile.find_last_of(".")+1) + "tree.sum";
339 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
341 if (pid == 0) { binUnclassified(taxFileNameTest); }
344 //create file needed for summary if it doesn't exist
345 ifstream FileTest(taxFileNameTest.c_str());
348 binUnclassified(taxFileNameTest);
352 catch(exception& e) {
353 m->errorOut(e, "PhyloTree", "setUp");
357 /**************************************************************************************************/
358 void PhyloTree::binUnclassified(string file){
362 openOutputFile(file, out);
364 map<string, int>::iterator itBin;
365 map<string, int>::iterator childPointer;
367 vector<TaxNode> copy = tree;
370 fillOutTree(0, copy);
372 //get leaf nodes that may need externsion
373 for (int i = 0; i < copy.size(); i++) {
375 if (copy[i].children.size() == 0) {
380 int copyNodes = copy.size();
382 //go through the seqs and if a sequence finest taxon is not the same level as the most finely defined taxon then classify it as unclassified where necessary
383 map<int, int>::iterator itLeaf;
384 for (itLeaf = leafNodes.begin(); itLeaf != leafNodes.end(); itLeaf++) {
386 if (m->control_pressed) { out.close(); break; }
388 int level = copy[itLeaf->second].level;
389 int currentNode = itLeaf->second;
391 //this sequence is unclassified at some levels
392 while(level <= maxLevel){
396 string taxon = "unclassified";
398 //does the parent have a child names 'unclassified'?
399 childPointer = copy[currentNode].children.find(taxon);
401 if(childPointer != copy[currentNode].children.end()){ //if the node already exists, move on
402 currentNode = childPointer->second; //currentNode becomes 'unclassified'
404 else{ //otherwise, create it
405 copy.push_back(TaxNode(taxon));
407 copy[currentNode].children[taxon] = copyNodes-1;
408 copy[copyNodes-1].parent = currentNode;
409 copy[copyNodes-1].level = copy[currentNode].level + 1;
411 currentNode = copy[currentNode].children[taxon];
416 if (!m->control_pressed) {
422 catch(exception& e) {
423 m->errorOut(e, "PhyloTree", "binUnclassified");
427 /**************************************************************************************************/
428 void PhyloTree::fillOutTree(int index, vector<TaxNode>& copy) {
430 map<string,int>::iterator it;
432 it = copy[index].children.find("unclassified");
433 if (it == copy[index].children.end()) { //no unclassified at this level
434 string taxon = "unclassified";
435 copy.push_back(TaxNode(taxon));
436 copy[index].children[taxon] = copy.size()-1;
437 copy[copy.size()-1].parent = index;
438 copy[copy.size()-1].level = copy[index].level + 1;
441 if (tree[index].level <= maxLevel) {
442 for(it=tree[index].children.begin();it!=tree[index].children.end();it++){ //check your children
443 fillOutTree(it->second, copy);
448 catch(exception& e) {
449 m->errorOut(e, "PhyloTree", "fillOutTree");
453 /**************************************************************************************************/
454 string PhyloTree::getFullTaxonomy(string seqName) {
458 int currentNode = name2Taxonomy[seqName];
460 while (tree[currentNode].parent != -1) {
461 tax = tree[currentNode].name + ";" + tax;
462 currentNode = tree[currentNode].parent;
467 catch(exception& e) {
468 m->errorOut(e, "PhyloTree", "getFullTaxonomy");
472 /**************************************************************************************************/
474 void PhyloTree::print(ofstream& out, vector<TaxNode>& copy){
476 out << copy.size() << endl;
478 for (int i = 0; i < copy.size(); i++) {
480 out << copy[i].level << '\t'<< copy[i].name << '\t' << copy[i].children.size() << '\t';
482 map<string,int>::iterator it;
483 for(it=copy[i].children.begin();it!=copy[i].children.end();it++){
484 out << it->first << '\t' << it->second << '\t';
491 catch(exception& e) {
492 m->errorOut(e, "PhyloTree", "print");
496 /**************************************************************************************************/
497 void PhyloTree::printTreeNodes(string treefilename) {
502 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
509 openOutputFile(treefilename, outTree);
512 outTree << tree.size() << endl;
513 for (int i = 0; i < tree.size(); i++) {
514 outTree << tree[i].name << '\t' << tree[i].level << '\t' << tree[i].parent << endl;
518 outTree << endl << uniqueTaxonomies.size() << endl;
519 map<int, int>::iterator it2;
520 for (it2=uniqueTaxonomies.begin(); it2!=uniqueTaxonomies.end(); it2++) { outTree << it2->first << '\t' << tree[it2->first].accessions.size() << endl; }
532 catch(exception& e) {
533 m->errorOut(e, "PhyloTree", "printTreeNodes");
537 /**************************************************************************************************/