X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=phylotree.cpp;fp=phylotree.cpp;h=0000000000000000000000000000000000000000;hb=4a877efa127e56e81a21f53cfdbbfd3bfbe8c4ff;hp=a9ef6cb0d85f44bde915606eb452db993ec40d8d;hpb=a6cf29fa4dac0909c7582cb1094151d34093ee76;p=mothur.git diff --git a/phylotree.cpp b/phylotree.cpp deleted file mode 100644 index a9ef6cb..0000000 --- a/phylotree.cpp +++ /dev/null @@ -1,656 +0,0 @@ -/* - * doTaxonomy.cpp - * - * - * Created by Pat Schloss on 6/17/09. - * Copyright 2009 Patrick D. Schloss. All rights reserved. - * - */ - -#include "phylotree.h" - -/**************************************************************************************************/ - -PhyloTree::PhyloTree(){ - try { - m = MothurOut::getInstance(); - numNodes = 1; - numSeqs = 0; - tree.push_back(TaxNode("Root")); - tree[0].heirarchyID = "0"; - maxLevel = 0; - calcTotals = true; - addSeqToTree("unknown", "unknown;"); - } - catch(exception& e) { - m->errorOut(e, "PhyloTree", "PhyloTree"); - exit(1); - } -} -/**************************************************************************************************/ - -PhyloTree::PhyloTree(ifstream& in, string filename){ - try { - m = MothurOut::getInstance(); - calcTotals = false; - numNodes = 0; - numSeqs = 0; - - #ifdef USE_MPI - MPI_File inMPI; - MPI_Offset size; - MPI_Status status; - - char inFileName[1024]; - strcpy(inFileName, filename.c_str()); - - MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI); - MPI_File_get_size(inMPI, &size); - - char* buffer = new char[size]; - MPI_File_read(inMPI, buffer, size, MPI_CHAR, &status); - - string tempBuf = buffer; - if (tempBuf.length() > size) { tempBuf = tempBuf.substr(0, size); } - istringstream iss (tempBuf,istringstream::in); - delete buffer; - - //read version - m->getline(iss); m->gobble(iss); - - iss >> numNodes; m->gobble(iss); - - tree.resize(numNodes); - - for (int i = 0; i < tree.size(); i++) { - iss >> tree[i].name >> tree[i].level >> tree[i].parent; m->gobble(iss); - } - - //read genus nodes - int numGenus = 0; - iss >> numGenus; m->gobble(iss); - - int gnode, gsize; - totals.clear(); - for (int i = 0; i < numGenus; i++) { - iss >> gnode >> gsize; m->gobble(iss); - - uniqueTaxonomies[gnode] = gnode; - totals.push_back(gsize); - } - - MPI_File_close(&inMPI); - - #else - //read version - string line = m->getline(in); m->gobble(in); - - in >> numNodes; m->gobble(in); - - tree.resize(numNodes); - - for (int i = 0; i < tree.size(); i++) { - in >> tree[i].name >> tree[i].level >> tree[i].parent; m->gobble(in); - } - - //read genus nodes - int numGenus = 0; - in >> numGenus; m->gobble(in); - - int gnode, gsize; - totals.clear(); - for (int i = 0; i < numGenus; i++) { - in >> gnode >> gsize; m->gobble(in); - - uniqueTaxonomies[gnode] = gnode; - totals.push_back(gsize); - } - - in.close(); - - #endif - - } - catch(exception& e) { - m->errorOut(e, "PhyloTree", "PhyloTree"); - exit(1); - } -} -/**************************************************************************************************/ - -PhyloTree::PhyloTree(string tfile){ - try { - m = MothurOut::getInstance(); - numNodes = 1; - numSeqs = 0; - tree.push_back(TaxNode("Root")); - tree[0].heirarchyID = "0"; - maxLevel = 0; - calcTotals = true; - string name, tax; - - #ifdef USE_MPI - int pid, num, processors; - vector positions; - - MPI_Status status; - MPI_File inMPI; - MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are - MPI_Comm_size(MPI_COMM_WORLD, &processors); - - char inFileName[1024]; - strcpy(inFileName, tfile.c_str()); - - MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI); //comm, filename, mode, info, filepointer - - if (pid == 0) { - positions = m->setFilePosEachLine(tfile, num); - - //send file positions to all processes - for(int i = 1; i < processors; i++) { - MPI_Send(&num, 1, MPI_INT, i, 2001, MPI_COMM_WORLD); - MPI_Send(&positions[0], (num+1), MPI_LONG, i, 2001, MPI_COMM_WORLD); - } - }else{ - MPI_Recv(&num, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status); - positions.resize(num+1); - MPI_Recv(&positions[0], (num+1), MPI_LONG, 0, 2001, MPI_COMM_WORLD, &status); - } - - //read file - for(int i=0;i length) { tempBuf = tempBuf.substr(0, length); } - delete buf4; - - istringstream iss (tempBuf,istringstream::in); - iss >> name >> tax; - addSeqToTree(name, tax); - } - - MPI_File_close(&inMPI); - MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case - - #else - ifstream in; - m->openInputFile(tfile, in); - - //read in users taxonomy file and add sequences to tree - while(!in.eof()){ - in >> name >> tax; m->gobble(in); - - addSeqToTree(name, tax); - } - in.close(); - #endif - - assignHeirarchyIDs(0); - - - string unknownTax = "unknown;"; - //added last taxon until you get desired level - for (int i = 1; i < maxLevel; i++) { - unknownTax += "unclassfied;"; - } - - addSeqToTree("unknown", unknownTax); - - //create file for summary if needed - setUp(tfile); - } - catch(exception& e) { - m->errorOut(e, "PhyloTree", "PhyloTree"); - exit(1); - } -} - -/**************************************************************************************************/ - -string PhyloTree::getNextTaxon(string& heirarchy, string seqname){ - try { - string currentLevel = ""; - if(heirarchy != ""){ - int pos = heirarchy.find_first_of(';'); - - if (pos == -1) { //you can't find another ; - currentLevel = heirarchy; - heirarchy = ""; - m->mothurOut(seqname + " is missing a ;, please check for other errors."); m->mothurOutEndLine(); - }else{ - currentLevel=heirarchy.substr(0,pos); - if (pos != (heirarchy.length()-1)) { heirarchy=heirarchy.substr(pos+1); } - else { heirarchy = ""; } - } - - } - return currentLevel; - } - catch(exception& e) { - m->errorOut(e, "PhyloTree", "getNextTaxon"); - exit(1); - } -} - -/**************************************************************************************************/ - -int PhyloTree::addSeqToTree(string seqName, string seqTaxonomy){ - try { - numSeqs++; - - map::iterator childPointer; - - int currentNode = 0; - int level = 1; - - tree[0].accessions.push_back(seqName); - m->removeConfidences(seqTaxonomy); - - string taxon;// = getNextTaxon(seqTaxonomy); - - while(seqTaxonomy != ""){ - - level++; - - if (m->control_pressed) { return 0; } - - //somehow the parent is getting one too many accnos - //use print to reassign the taxa id - taxon = getNextTaxon(seqTaxonomy, seqName); - - if (taxon == "") { m->mothurOut(seqName + " has an error in the taxonomy. This may be due to a ;;"); m->mothurOutEndLine(); if (currentNode != 0) { uniqueTaxonomies[currentNode] = currentNode; } break; } - - childPointer = tree[currentNode].children.find(taxon); - - if(childPointer != tree[currentNode].children.end()){ //if the node already exists, move on - currentNode = childPointer->second; - tree[currentNode].accessions.push_back(seqName); - name2Taxonomy[seqName] = currentNode; - } - else{ //otherwise, create it - tree.push_back(TaxNode(taxon)); - numNodes++; - tree[currentNode].children[taxon] = numNodes-1; - tree[numNodes-1].parent = currentNode; - - currentNode = tree[currentNode].children[taxon]; - tree[currentNode].accessions.push_back(seqName); - name2Taxonomy[seqName] = currentNode; - } - - if (seqTaxonomy == "") { uniqueTaxonomies[currentNode] = currentNode; } - } - - return 0; - } - catch(exception& e) { - m->errorOut(e, "PhyloTree", "addSeqToTree"); - exit(1); - } -} -/**************************************************************************************************/ -vector PhyloTree::getGenusNodes() { - try { - genusIndex.clear(); - //generate genusIndexes - map::iterator it2; - for (it2=uniqueTaxonomies.begin(); it2!=uniqueTaxonomies.end(); it2++) { genusIndex.push_back(it2->first); } - - return genusIndex; - } - catch(exception& e) { - m->errorOut(e, "PhyloTree", "getGenusNodes"); - exit(1); - } -} -/**************************************************************************************************/ -vector PhyloTree::getGenusTotals() { - try { - - if (calcTotals) { - totals.clear(); - //reset counts because we are on a new word - for (int j = 0; j < genusIndex.size(); j++) { - totals.push_back(tree[genusIndex[j]].accessions.size()); - } - return totals; - }else{ - return totals; - } - - } - catch(exception& e) { - m->errorOut(e, "PhyloTree", "getGenusNodes"); - exit(1); - } -} -/**************************************************************************************************/ - -void PhyloTree::assignHeirarchyIDs(int index){ - try { - map::iterator it; - int counter = 1; - - for(it=tree[index].children.begin();it!=tree[index].children.end();it++){ - tree[it->second].heirarchyID = tree[index].heirarchyID + '.' + toString(counter); - counter++; - tree[it->second].level = tree[index].level + 1; - - //save maxLevel for binning the unclassified seqs - if (tree[it->second].level > maxLevel) { maxLevel = tree[it->second].level; } - - assignHeirarchyIDs(it->second); - } - } - catch(exception& e) { - m->errorOut(e, "PhyloTree", "assignHeirarchyIDs"); - exit(1); - } -} -/**************************************************************************************************/ -void PhyloTree::setUp(string tfile){ - try{ - string taxFileNameTest = tfile.substr(0,tfile.find_last_of(".")+1) + "tree.sum"; - - #ifdef USE_MPI - int pid; - MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are - - if (pid == 0) { binUnclassified(taxFileNameTest); } - - #else - binUnclassified(taxFileNameTest); - #endif - } - catch(exception& e) { - m->errorOut(e, "PhyloTree", "setUp"); - exit(1); - } -} -/**************************************************************************************************/ -void PhyloTree::binUnclassified(string file){ - try { - - ofstream out; - m->openOutputFile(file, out); - - map::iterator itBin; - map::iterator childPointer; - - vector copy = tree; - - //fill out tree - fillOutTree(0, copy); - - //get leaf nodes that may need extension - for (int i = 0; i < copy.size(); i++) { - - if (copy[i].children.size() == 0) { - leafNodes[i] = i; - } - } - - int copyNodes = copy.size(); - - //go through the seqs and if a sequence finest taxon is not the same level as the most finely defined taxon then classify it as unclassified where necessary - map::iterator itLeaf; - for (itLeaf = leafNodes.begin(); itLeaf != leafNodes.end(); itLeaf++) { - - if (m->control_pressed) { out.close(); break; } - - int level = copy[itLeaf->second].level; - int currentNode = itLeaf->second; - - //this sequence is unclassified at some levels - while(level < maxLevel){ - - level++; - - string taxon = "unclassified"; - - //does the parent have a child names 'unclassified'? - childPointer = copy[currentNode].children.find(taxon); - - if(childPointer != copy[currentNode].children.end()){ //if the node already exists, move on - currentNode = childPointer->second; //currentNode becomes 'unclassified' - } - else{ //otherwise, create it - copy.push_back(TaxNode(taxon)); - copyNodes++; - copy[currentNode].children[taxon] = copyNodes-1; - copy[copyNodes-1].parent = currentNode; - copy[copyNodes-1].level = copy[currentNode].level + 1; - - currentNode = copy[currentNode].children[taxon]; - } - } - } - - if (!m->control_pressed) { - //print copy tree - print(out, copy); - } - - } - catch(exception& e) { - m->errorOut(e, "PhyloTree", "binUnclassified"); - exit(1); - } -} -/**************************************************************************************************/ -void PhyloTree::fillOutTree(int index, vector& copy) { - try { - - map::iterator it; - - it = copy[index].children.find("unclassified"); - if (it == copy[index].children.end()) { //no unclassified at this level - string taxon = "unclassified"; - copy.push_back(TaxNode(taxon)); - copy[index].children[taxon] = copy.size()-1; - copy[copy.size()-1].parent = index; - copy[copy.size()-1].level = copy[index].level + 1; - } - - if (tree[index].level < maxLevel) { - for(it=tree[index].children.begin();it!=tree[index].children.end();it++){ //check your children - fillOutTree(it->second, copy); - } - } - - } - catch(exception& e) { - m->errorOut(e, "PhyloTree", "fillOutTree"); - exit(1); - } -} -/**************************************************************************************************/ -string PhyloTree::getFullTaxonomy(string seqName) { - try { - string tax = ""; - - int currentNode = name2Taxonomy[seqName]; - - while (tree[currentNode].parent != -1) { - tax = tree[currentNode].name + ";" + tax; - currentNode = tree[currentNode].parent; - } - - return tax; - } - catch(exception& e) { - m->errorOut(e, "PhyloTree", "getFullTaxonomy"); - exit(1); - } -} -/**************************************************************************************************/ - -void PhyloTree::print(ofstream& out, vector& copy){ - try { - - //output mothur version - out << "#" << m->getVersion() << endl; - - out << copy.size() << endl; - - out << maxLevel << endl; - - for (int i = 0; i < copy.size(); i++) { - - out << copy[i].level << '\t'<< copy[i].name << '\t' << copy[i].children.size() << '\t'; - - map::iterator it; - for(it=copy[i].children.begin();it!=copy[i].children.end();it++){ - out << it->first << '\t' << it->second << '\t'; - } - out << endl; - } - - out.close(); - } - catch(exception& e) { - m->errorOut(e, "PhyloTree", "print"); - exit(1); - } -} -/**************************************************************************************************/ -void PhyloTree::printTreeNodes(string treefilename) { - try { - - #ifdef USE_MPI - int pid; - MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are - - if (pid == 0) { - - #endif - - ofstream outTree; - m->openOutputFile(treefilename, outTree); - - //output mothur version - outTree << "#" << m->getVersion() << endl; - - //print treenodes - outTree << tree.size() << endl; - for (int i = 0; i < tree.size(); i++) { - outTree << tree[i].name << '\t' << tree[i].level << '\t' << tree[i].parent << endl; - } - - //print genus nodes - outTree << endl << uniqueTaxonomies.size() << endl; - map::iterator it2; - for (it2=uniqueTaxonomies.begin(); it2!=uniqueTaxonomies.end(); it2++) { outTree << it2->first << '\t' << tree[it2->first].accessions.size() << endl; } - outTree << endl; - - outTree.close(); - - #ifdef USE_MPI - } - #endif - - - } - catch(exception& e) { - m->errorOut(e, "PhyloTree", "printTreeNodes"); - exit(1); - } -} -/**************************************************************************************************/ -TaxNode PhyloTree::get(int i ){ - try { - if (i < tree.size()) { return tree[i]; } - else { cout << i << '\t' << tree.size() << endl ; m->mothurOut("Mismatch with taxonomy and template files. Cannot continue."); m->mothurOutEndLine(); exit(1); } - } - catch(exception& e) { - m->errorOut(e, "PhyloTree", "get"); - exit(1); - } -} -/**************************************************************************************************/ -TaxNode PhyloTree::get(string seqName){ - try { - map::iterator itFind = name2Taxonomy.find(seqName); - - if (itFind != name2Taxonomy.end()) { return tree[name2Taxonomy[seqName]]; } - else { m->mothurOut("Cannot find " + seqName + ". Mismatch with taxonomy and template files. Cannot continue."); m->mothurOutEndLine(); exit(1);} - } - catch(exception& e) { - m->errorOut(e, "PhyloTree", "get"); - exit(1); - } -} -/**************************************************************************************************/ -string PhyloTree::getName(int i ){ - try { - if (i < tree.size()) { return tree[i].name; } - else { m->mothurOut("Mismatch with taxonomy and template files. Cannot continue."); m->mothurOutEndLine(); exit(1); } - } - catch(exception& e) { - m->errorOut(e, "PhyloTree", "get"); - exit(1); - } -} -/**************************************************************************************************/ -int PhyloTree::getIndex(string seqName){ - try { - map::iterator itFind = name2Taxonomy.find(seqName); - - if (itFind != name2Taxonomy.end()) { return name2Taxonomy[seqName]; } - else { m->mothurOut("Cannot find " + seqName + ". Mismatch with taxonomy and template files. Cannot continue."); m->mothurOutEndLine(); exit(1);} - } - catch(exception& e) { - m->errorOut(e, "PhyloTree", "get"); - exit(1); - } -} -/**************************************************************************************************/ -bool PhyloTree::ErrorCheck(vector templateFileNames){ - try { - - bool okay = true; - templateFileNames.push_back("unknown"); - - map::iterator itFind; - map taxonomyFileNames = name2Taxonomy; - - for (int i = 0; i < templateFileNames.size(); i++) { - itFind = taxonomyFileNames.find(templateFileNames[i]); - - if (itFind != taxonomyFileNames.end()) { //found it so erase it - taxonomyFileNames.erase(itFind); - }else { - m->mothurOut(templateFileNames[i] + " is in your template file and is not in your taxonomy file. Please correct."); m->mothurOutEndLine(); - okay = false; - } - - //templateFileNames.erase(templateFileNames.begin()+i); - //i--; - } - templateFileNames.clear(); - - if (taxonomyFileNames.size() > 0) { //there are names in tax file that are not in template - okay = false; - - for (itFind = taxonomyFileNames.begin(); itFind != taxonomyFileNames.end(); itFind++) { - m->mothurOut(itFind->first + " is in your taxonomy file and is not in your template file. Please correct."); m->mothurOutEndLine(); - } - } - - return okay; - } - catch(exception& e) { - m->errorOut(e, "PhyloTree", "ErrorCheck"); - exit(1); - } -} -/**************************************************************************************************/ - - - -