X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=phylotree.cpp;fp=phylotree.cpp;h=a9ef6cb0d85f44bde915606eb452db993ec40d8d;hb=0caf3fbabaa3ece404f8ce77f4c883dc5b1bf1dc;hp=0000000000000000000000000000000000000000;hpb=1b73ff67c83892a025e597dabd9df6fe7b58206a;p=mothur.git diff --git a/phylotree.cpp b/phylotree.cpp new file mode 100644 index 0000000..a9ef6cb --- /dev/null +++ b/phylotree.cpp @@ -0,0 +1,656 @@ +/* + * doTaxonomy.cpp + * + * + * Created by Pat Schloss on 6/17/09. + * Copyright 2009 Patrick D. Schloss. All rights reserved. + * + */ + +#include "phylotree.h" + +/**************************************************************************************************/ + +PhyloTree::PhyloTree(){ + try { + m = MothurOut::getInstance(); + numNodes = 1; + numSeqs = 0; + tree.push_back(TaxNode("Root")); + tree[0].heirarchyID = "0"; + maxLevel = 0; + calcTotals = true; + addSeqToTree("unknown", "unknown;"); + } + catch(exception& e) { + m->errorOut(e, "PhyloTree", "PhyloTree"); + exit(1); + } +} +/**************************************************************************************************/ + +PhyloTree::PhyloTree(ifstream& in, string filename){ + try { + m = MothurOut::getInstance(); + calcTotals = false; + numNodes = 0; + numSeqs = 0; + + #ifdef USE_MPI + MPI_File inMPI; + MPI_Offset size; + MPI_Status status; + + char inFileName[1024]; + strcpy(inFileName, filename.c_str()); + + MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI); + MPI_File_get_size(inMPI, &size); + + char* buffer = new char[size]; + MPI_File_read(inMPI, buffer, size, MPI_CHAR, &status); + + string tempBuf = buffer; + if (tempBuf.length() > size) { tempBuf = tempBuf.substr(0, size); } + istringstream iss (tempBuf,istringstream::in); + delete buffer; + + //read version + m->getline(iss); m->gobble(iss); + + iss >> numNodes; m->gobble(iss); + + tree.resize(numNodes); + + for (int i = 0; i < tree.size(); i++) { + iss >> tree[i].name >> tree[i].level >> tree[i].parent; m->gobble(iss); + } + + //read genus nodes + int numGenus = 0; + iss >> numGenus; m->gobble(iss); + + int gnode, gsize; + totals.clear(); + for (int i = 0; i < numGenus; i++) { + iss >> gnode >> gsize; m->gobble(iss); + + uniqueTaxonomies[gnode] = gnode; + totals.push_back(gsize); + } + + MPI_File_close(&inMPI); + + #else + //read version + string line = m->getline(in); m->gobble(in); + + in >> numNodes; m->gobble(in); + + tree.resize(numNodes); + + for (int i = 0; i < tree.size(); i++) { + in >> tree[i].name >> tree[i].level >> tree[i].parent; m->gobble(in); + } + + //read genus nodes + int numGenus = 0; + in >> numGenus; m->gobble(in); + + int gnode, gsize; + totals.clear(); + for (int i = 0; i < numGenus; i++) { + in >> gnode >> gsize; m->gobble(in); + + uniqueTaxonomies[gnode] = gnode; + totals.push_back(gsize); + } + + in.close(); + + #endif + + } + catch(exception& e) { + m->errorOut(e, "PhyloTree", "PhyloTree"); + exit(1); + } +} +/**************************************************************************************************/ + +PhyloTree::PhyloTree(string tfile){ + try { + m = MothurOut::getInstance(); + numNodes = 1; + numSeqs = 0; + tree.push_back(TaxNode("Root")); + tree[0].heirarchyID = "0"; + maxLevel = 0; + calcTotals = true; + string name, tax; + + #ifdef USE_MPI + int pid, num, processors; + vector positions; + + MPI_Status status; + MPI_File inMPI; + MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are + MPI_Comm_size(MPI_COMM_WORLD, &processors); + + char inFileName[1024]; + strcpy(inFileName, tfile.c_str()); + + MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI); //comm, filename, mode, info, filepointer + + if (pid == 0) { + positions = m->setFilePosEachLine(tfile, num); + + //send file positions to all processes + for(int i = 1; i < processors; i++) { + MPI_Send(&num, 1, MPI_INT, i, 2001, MPI_COMM_WORLD); + MPI_Send(&positions[0], (num+1), MPI_LONG, i, 2001, MPI_COMM_WORLD); + } + }else{ + MPI_Recv(&num, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status); + positions.resize(num+1); + MPI_Recv(&positions[0], (num+1), MPI_LONG, 0, 2001, MPI_COMM_WORLD, &status); + } + + //read file + for(int i=0;i length) { tempBuf = tempBuf.substr(0, length); } + delete buf4; + + istringstream iss (tempBuf,istringstream::in); + iss >> name >> tax; + addSeqToTree(name, tax); + } + + MPI_File_close(&inMPI); + MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case + + #else + ifstream in; + m->openInputFile(tfile, in); + + //read in users taxonomy file and add sequences to tree + while(!in.eof()){ + in >> name >> tax; m->gobble(in); + + addSeqToTree(name, tax); + } + in.close(); + #endif + + assignHeirarchyIDs(0); + + + string unknownTax = "unknown;"; + //added last taxon until you get desired level + for (int i = 1; i < maxLevel; i++) { + unknownTax += "unclassfied;"; + } + + addSeqToTree("unknown", unknownTax); + + //create file for summary if needed + setUp(tfile); + } + catch(exception& e) { + m->errorOut(e, "PhyloTree", "PhyloTree"); + exit(1); + } +} + +/**************************************************************************************************/ + +string PhyloTree::getNextTaxon(string& heirarchy, string seqname){ + try { + string currentLevel = ""; + if(heirarchy != ""){ + int pos = heirarchy.find_first_of(';'); + + if (pos == -1) { //you can't find another ; + currentLevel = heirarchy; + heirarchy = ""; + m->mothurOut(seqname + " is missing a ;, please check for other errors."); m->mothurOutEndLine(); + }else{ + currentLevel=heirarchy.substr(0,pos); + if (pos != (heirarchy.length()-1)) { heirarchy=heirarchy.substr(pos+1); } + else { heirarchy = ""; } + } + + } + return currentLevel; + } + catch(exception& e) { + m->errorOut(e, "PhyloTree", "getNextTaxon"); + exit(1); + } +} + +/**************************************************************************************************/ + +int PhyloTree::addSeqToTree(string seqName, string seqTaxonomy){ + try { + numSeqs++; + + map::iterator childPointer; + + int currentNode = 0; + int level = 1; + + tree[0].accessions.push_back(seqName); + m->removeConfidences(seqTaxonomy); + + string taxon;// = getNextTaxon(seqTaxonomy); + + while(seqTaxonomy != ""){ + + level++; + + if (m->control_pressed) { return 0; } + + //somehow the parent is getting one too many accnos + //use print to reassign the taxa id + taxon = getNextTaxon(seqTaxonomy, seqName); + + if (taxon == "") { m->mothurOut(seqName + " has an error in the taxonomy. This may be due to a ;;"); m->mothurOutEndLine(); if (currentNode != 0) { uniqueTaxonomies[currentNode] = currentNode; } break; } + + childPointer = tree[currentNode].children.find(taxon); + + if(childPointer != tree[currentNode].children.end()){ //if the node already exists, move on + currentNode = childPointer->second; + tree[currentNode].accessions.push_back(seqName); + name2Taxonomy[seqName] = currentNode; + } + else{ //otherwise, create it + tree.push_back(TaxNode(taxon)); + numNodes++; + tree[currentNode].children[taxon] = numNodes-1; + tree[numNodes-1].parent = currentNode; + + currentNode = tree[currentNode].children[taxon]; + tree[currentNode].accessions.push_back(seqName); + name2Taxonomy[seqName] = currentNode; + } + + if (seqTaxonomy == "") { uniqueTaxonomies[currentNode] = currentNode; } + } + + return 0; + } + catch(exception& e) { + m->errorOut(e, "PhyloTree", "addSeqToTree"); + exit(1); + } +} +/**************************************************************************************************/ +vector PhyloTree::getGenusNodes() { + try { + genusIndex.clear(); + //generate genusIndexes + map::iterator it2; + for (it2=uniqueTaxonomies.begin(); it2!=uniqueTaxonomies.end(); it2++) { genusIndex.push_back(it2->first); } + + return genusIndex; + } + catch(exception& e) { + m->errorOut(e, "PhyloTree", "getGenusNodes"); + exit(1); + } +} +/**************************************************************************************************/ +vector PhyloTree::getGenusTotals() { + try { + + if (calcTotals) { + totals.clear(); + //reset counts because we are on a new word + for (int j = 0; j < genusIndex.size(); j++) { + totals.push_back(tree[genusIndex[j]].accessions.size()); + } + return totals; + }else{ + return totals; + } + + } + catch(exception& e) { + m->errorOut(e, "PhyloTree", "getGenusNodes"); + exit(1); + } +} +/**************************************************************************************************/ + +void PhyloTree::assignHeirarchyIDs(int index){ + try { + map::iterator it; + int counter = 1; + + for(it=tree[index].children.begin();it!=tree[index].children.end();it++){ + tree[it->second].heirarchyID = tree[index].heirarchyID + '.' + toString(counter); + counter++; + tree[it->second].level = tree[index].level + 1; + + //save maxLevel for binning the unclassified seqs + if (tree[it->second].level > maxLevel) { maxLevel = tree[it->second].level; } + + assignHeirarchyIDs(it->second); + } + } + catch(exception& e) { + m->errorOut(e, "PhyloTree", "assignHeirarchyIDs"); + exit(1); + } +} +/**************************************************************************************************/ +void PhyloTree::setUp(string tfile){ + try{ + string taxFileNameTest = tfile.substr(0,tfile.find_last_of(".")+1) + "tree.sum"; + + #ifdef USE_MPI + int pid; + MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are + + if (pid == 0) { binUnclassified(taxFileNameTest); } + + #else + binUnclassified(taxFileNameTest); + #endif + } + catch(exception& e) { + m->errorOut(e, "PhyloTree", "setUp"); + exit(1); + } +} +/**************************************************************************************************/ +void PhyloTree::binUnclassified(string file){ + try { + + ofstream out; + m->openOutputFile(file, out); + + map::iterator itBin; + map::iterator childPointer; + + vector copy = tree; + + //fill out tree + fillOutTree(0, copy); + + //get leaf nodes that may need extension + for (int i = 0; i < copy.size(); i++) { + + if (copy[i].children.size() == 0) { + leafNodes[i] = i; + } + } + + int copyNodes = copy.size(); + + //go through the seqs and if a sequence finest taxon is not the same level as the most finely defined taxon then classify it as unclassified where necessary + map::iterator itLeaf; + for (itLeaf = leafNodes.begin(); itLeaf != leafNodes.end(); itLeaf++) { + + if (m->control_pressed) { out.close(); break; } + + int level = copy[itLeaf->second].level; + int currentNode = itLeaf->second; + + //this sequence is unclassified at some levels + while(level < maxLevel){ + + level++; + + string taxon = "unclassified"; + + //does the parent have a child names 'unclassified'? + childPointer = copy[currentNode].children.find(taxon); + + if(childPointer != copy[currentNode].children.end()){ //if the node already exists, move on + currentNode = childPointer->second; //currentNode becomes 'unclassified' + } + else{ //otherwise, create it + copy.push_back(TaxNode(taxon)); + copyNodes++; + copy[currentNode].children[taxon] = copyNodes-1; + copy[copyNodes-1].parent = currentNode; + copy[copyNodes-1].level = copy[currentNode].level + 1; + + currentNode = copy[currentNode].children[taxon]; + } + } + } + + if (!m->control_pressed) { + //print copy tree + print(out, copy); + } + + } + catch(exception& e) { + m->errorOut(e, "PhyloTree", "binUnclassified"); + exit(1); + } +} +/**************************************************************************************************/ +void PhyloTree::fillOutTree(int index, vector& copy) { + try { + + map::iterator it; + + it = copy[index].children.find("unclassified"); + if (it == copy[index].children.end()) { //no unclassified at this level + string taxon = "unclassified"; + copy.push_back(TaxNode(taxon)); + copy[index].children[taxon] = copy.size()-1; + copy[copy.size()-1].parent = index; + copy[copy.size()-1].level = copy[index].level + 1; + } + + if (tree[index].level < maxLevel) { + for(it=tree[index].children.begin();it!=tree[index].children.end();it++){ //check your children + fillOutTree(it->second, copy); + } + } + + } + catch(exception& e) { + m->errorOut(e, "PhyloTree", "fillOutTree"); + exit(1); + } +} +/**************************************************************************************************/ +string PhyloTree::getFullTaxonomy(string seqName) { + try { + string tax = ""; + + int currentNode = name2Taxonomy[seqName]; + + while (tree[currentNode].parent != -1) { + tax = tree[currentNode].name + ";" + tax; + currentNode = tree[currentNode].parent; + } + + return tax; + } + catch(exception& e) { + m->errorOut(e, "PhyloTree", "getFullTaxonomy"); + exit(1); + } +} +/**************************************************************************************************/ + +void PhyloTree::print(ofstream& out, vector& copy){ + try { + + //output mothur version + out << "#" << m->getVersion() << endl; + + out << copy.size() << endl; + + out << maxLevel << endl; + + for (int i = 0; i < copy.size(); i++) { + + out << copy[i].level << '\t'<< copy[i].name << '\t' << copy[i].children.size() << '\t'; + + map::iterator it; + for(it=copy[i].children.begin();it!=copy[i].children.end();it++){ + out << it->first << '\t' << it->second << '\t'; + } + out << endl; + } + + out.close(); + } + catch(exception& e) { + m->errorOut(e, "PhyloTree", "print"); + exit(1); + } +} +/**************************************************************************************************/ +void PhyloTree::printTreeNodes(string treefilename) { + try { + + #ifdef USE_MPI + int pid; + MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are + + if (pid == 0) { + + #endif + + ofstream outTree; + m->openOutputFile(treefilename, outTree); + + //output mothur version + outTree << "#" << m->getVersion() << endl; + + //print treenodes + outTree << tree.size() << endl; + for (int i = 0; i < tree.size(); i++) { + outTree << tree[i].name << '\t' << tree[i].level << '\t' << tree[i].parent << endl; + } + + //print genus nodes + outTree << endl << uniqueTaxonomies.size() << endl; + map::iterator it2; + for (it2=uniqueTaxonomies.begin(); it2!=uniqueTaxonomies.end(); it2++) { outTree << it2->first << '\t' << tree[it2->first].accessions.size() << endl; } + outTree << endl; + + outTree.close(); + + #ifdef USE_MPI + } + #endif + + + } + catch(exception& e) { + m->errorOut(e, "PhyloTree", "printTreeNodes"); + exit(1); + } +} +/**************************************************************************************************/ +TaxNode PhyloTree::get(int i ){ + try { + if (i < tree.size()) { return tree[i]; } + else { cout << i << '\t' << tree.size() << endl ; m->mothurOut("Mismatch with taxonomy and template files. Cannot continue."); m->mothurOutEndLine(); exit(1); } + } + catch(exception& e) { + m->errorOut(e, "PhyloTree", "get"); + exit(1); + } +} +/**************************************************************************************************/ +TaxNode PhyloTree::get(string seqName){ + try { + map::iterator itFind = name2Taxonomy.find(seqName); + + if (itFind != name2Taxonomy.end()) { return tree[name2Taxonomy[seqName]]; } + else { m->mothurOut("Cannot find " + seqName + ". Mismatch with taxonomy and template files. Cannot continue."); m->mothurOutEndLine(); exit(1);} + } + catch(exception& e) { + m->errorOut(e, "PhyloTree", "get"); + exit(1); + } +} +/**************************************************************************************************/ +string PhyloTree::getName(int i ){ + try { + if (i < tree.size()) { return tree[i].name; } + else { m->mothurOut("Mismatch with taxonomy and template files. Cannot continue."); m->mothurOutEndLine(); exit(1); } + } + catch(exception& e) { + m->errorOut(e, "PhyloTree", "get"); + exit(1); + } +} +/**************************************************************************************************/ +int PhyloTree::getIndex(string seqName){ + try { + map::iterator itFind = name2Taxonomy.find(seqName); + + if (itFind != name2Taxonomy.end()) { return name2Taxonomy[seqName]; } + else { m->mothurOut("Cannot find " + seqName + ". Mismatch with taxonomy and template files. Cannot continue."); m->mothurOutEndLine(); exit(1);} + } + catch(exception& e) { + m->errorOut(e, "PhyloTree", "get"); + exit(1); + } +} +/**************************************************************************************************/ +bool PhyloTree::ErrorCheck(vector templateFileNames){ + try { + + bool okay = true; + templateFileNames.push_back("unknown"); + + map::iterator itFind; + map taxonomyFileNames = name2Taxonomy; + + for (int i = 0; i < templateFileNames.size(); i++) { + itFind = taxonomyFileNames.find(templateFileNames[i]); + + if (itFind != taxonomyFileNames.end()) { //found it so erase it + taxonomyFileNames.erase(itFind); + }else { + m->mothurOut(templateFileNames[i] + " is in your template file and is not in your taxonomy file. Please correct."); m->mothurOutEndLine(); + okay = false; + } + + //templateFileNames.erase(templateFileNames.begin()+i); + //i--; + } + templateFileNames.clear(); + + if (taxonomyFileNames.size() > 0) { //there are names in tax file that are not in template + okay = false; + + for (itFind = taxonomyFileNames.begin(); itFind != taxonomyFileNames.end(); itFind++) { + m->mothurOut(itFind->first + " is in your taxonomy file and is not in your template file. Please correct."); m->mothurOutEndLine(); + } + } + + return okay; + } + catch(exception& e) { + m->errorOut(e, "PhyloTree", "ErrorCheck"); + exit(1); + } +} +/**************************************************************************************************/ + + + +