X-Git-Url: https://git.donarmstrong.com/?p=mothur.git;a=blobdiff_plain;f=phylotree.cpp;h=b9bab4ec2934a4121305f87111cf2d238f5ec4d5;hp=8c48e4f1f783c9387c83d4e5d1bfdbb475eb09cf;hb=df7e3ff9f68ef157b0328a2d353c3258c5d45d89;hpb=cd9dbd8b53bbe32af3e9c6bead4aa6d796a278a5 diff --git a/phylotree.cpp b/phylotree.cpp index 8c48e4f..b9bab4e 100644 --- a/phylotree.cpp +++ b/phylotree.cpp @@ -20,6 +20,7 @@ PhyloTree::PhyloTree(){ tree[0].heirarchyID = "0"; maxLevel = 0; calcTotals = true; + addSeqToTree("unknown", "unknown;"); } catch(exception& e) { m->errorOut(e, "PhyloTree", "PhyloTree"); @@ -32,6 +33,8 @@ PhyloTree::PhyloTree(ifstream& in, string filename){ try { m = MothurOut::getInstance(); calcTotals = false; + numNodes = 0; + numSeqs = 0; #ifdef USE_MPI MPI_File inMPI; @@ -52,48 +55,54 @@ PhyloTree::PhyloTree(ifstream& in, string filename){ istringstream iss (tempBuf,istringstream::in); delete buffer; - iss >> numNodes; gobble(iss); + //read version + m->getline(iss); m->gobble(iss); + + iss >> numNodes; m->gobble(iss); tree.resize(numNodes); for (int i = 0; i < tree.size(); i++) { - iss >> tree[i].name >> tree[i].level >> tree[i].parent; gobble(iss); + iss >> tree[i].name >> tree[i].level >> tree[i].parent; m->gobble(iss); } //read genus nodes int numGenus = 0; - iss >> numGenus; gobble(iss); + iss >> numGenus; m->gobble(iss); int gnode, gsize; totals.clear(); for (int i = 0; i < numGenus; i++) { - iss >> gnode >> gsize; gobble(iss); + iss >> gnode >> gsize; m->gobble(iss); - uniqueTaxonomies[gnode] = gnode; + uniqueTaxonomies.insert(gnode); totals.push_back(gsize); } MPI_File_close(&inMPI); #else - in >> numNodes; gobble(in); + //read version + string line = m->getline(in); m->gobble(in); + + in >> numNodes; m->gobble(in); tree.resize(numNodes); for (int i = 0; i < tree.size(); i++) { - in >> tree[i].name >> tree[i].level >> tree[i].parent; gobble(in); + in >> tree[i].name >> tree[i].level >> tree[i].parent; m->gobble(in); } //read genus nodes int numGenus = 0; - in >> numGenus; gobble(in); + in >> numGenus; m->gobble(in); int gnode, gsize; totals.clear(); for (int i = 0; i < numGenus; i++) { - in >> gnode >> gsize; gobble(in); + in >> gnode >> gsize; m->gobble(in); - uniqueTaxonomies[gnode] = gnode; + uniqueTaxonomies.insert(gnode); totals.push_back(gsize); } @@ -119,15 +128,15 @@ PhyloTree::PhyloTree(string tfile){ maxLevel = 0; calcTotals = true; string name, tax; - #ifdef USE_MPI - int pid, num; - vector positions; + int pid, num, processors; + vector positions; MPI_Status status; MPI_File inMPI; MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are + MPI_Comm_size(MPI_COMM_WORLD, &processors); char inFileName[1024]; strcpy(inFileName, tfile.c_str()); @@ -135,15 +144,17 @@ PhyloTree::PhyloTree(string tfile){ MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI); //comm, filename, mode, info, filepointer if (pid == 0) { - positions = setFilePosEachLine(tfile, num); + positions = m->setFilePosEachLine(tfile, num); //send file positions to all processes - MPI_Bcast(&num, 1, MPI_INT, 0, MPI_COMM_WORLD); //send numSeqs - MPI_Bcast(&positions[0], (num+1), MPI_LONG, 0, MPI_COMM_WORLD); //send file pos + for(int i = 1; i < processors; i++) { + MPI_Send(&num, 1, MPI_INT, i, 2001, MPI_COMM_WORLD); + MPI_Send(&positions[0], (num+1), MPI_LONG, i, 2001, MPI_COMM_WORLD); + } }else{ - MPI_Bcast(&num, 1, MPI_INT, 0, MPI_COMM_WORLD); //get numSeqs - positions.resize(num); - MPI_Bcast(&positions[0], (num+1), MPI_LONG, 0, MPI_COMM_WORLD); //get file positions + MPI_Recv(&num, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status); + positions.resize(num+1); + MPI_Recv(&positions[0], (num+1), MPI_LONG, 0, 2001, MPI_COMM_WORLD, &status); } //read file @@ -164,22 +175,29 @@ PhyloTree::PhyloTree(string tfile){ } MPI_File_close(&inMPI); + MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case #else - ifstream in; - openInputFile(tfile, in); - - //read in users taxonomy file and add sequences to tree - while(!in.eof()){ - in >> name >> tax; gobble(in); - - addSeqToTree(name, tax); - } - in.close(); + map temp; + m->readTax(tfile, temp); + + for (map::iterator itTemp = temp.begin(); itTemp != temp.end();) { + addSeqToTree(itTemp->first, itTemp->second); + temp.erase(itTemp++); + } #endif - + assignHeirarchyIDs(0); - + + + string unknownTax = "unknown;"; + //added last taxon until you get desired level + for (int i = 1; i < maxLevel; i++) { + unknownTax += "unclassfied;"; + } + + addSeqToTree("unknown", unknownTax); + //create file for summary if needed setUp(tfile); } @@ -191,14 +209,22 @@ PhyloTree::PhyloTree(string tfile){ /**************************************************************************************************/ -string PhyloTree::getNextTaxon(string& heirarchy){ +string PhyloTree::getNextTaxon(string& heirarchy, string seqname){ try { string currentLevel = ""; if(heirarchy != ""){ int pos = heirarchy.find_first_of(';'); - currentLevel=heirarchy.substr(0,pos); - if (pos != (heirarchy.length()-1)) { heirarchy=heirarchy.substr(pos+1); } - else { heirarchy = ""; } + + if (pos == -1) { //you can't find another ; + currentLevel = heirarchy; + heirarchy = ""; + m->mothurOut(seqname + " is missing a ;, please check for other errors."); m->mothurOutEndLine(); + }else{ + currentLevel=heirarchy.substr(0,pos); + if (pos != (heirarchy.length()-1)) { heirarchy=heirarchy.substr(pos+1); } + else { heirarchy = ""; } + } + } return currentLevel; } @@ -220,17 +246,23 @@ int PhyloTree::addSeqToTree(string seqName, string seqTaxonomy){ int level = 1; tree[0].accessions.push_back(seqName); - string taxon;// = getNextTaxon(seqTaxonomy); + m->removeConfidences(seqTaxonomy); + string taxon;// = getNextTaxon(seqTaxonomy); + while(seqTaxonomy != ""){ level++; - + if (m->control_pressed) { return 0; } //somehow the parent is getting one too many accnos //use print to reassign the taxa id - taxon = getNextTaxon(seqTaxonomy); + taxon = getNextTaxon(seqTaxonomy, seqName); + + if (m->debug) { m->mothurOut(seqName +'\t' + taxon +'\n'); } + + if (taxon == "") { m->mothurOut(seqName + " has an error in the taxonomy. This may be due to a ;;"); m->mothurOutEndLine(); if (currentNode != 0) { uniqueTaxonomies.insert(currentNode); } break; } childPointer = tree[currentNode].children.find(taxon); @@ -245,21 +277,15 @@ int PhyloTree::addSeqToTree(string seqName, string seqTaxonomy){ tree[currentNode].children[taxon] = numNodes-1; tree[numNodes-1].parent = currentNode; - // int numChildren = tree[currentNode].children.size(); - // string heirarchyID = tree[currentNode].heirarchyID; - // tree[currentNode].accessions.push_back(seqName); - currentNode = tree[currentNode].children[taxon]; tree[currentNode].accessions.push_back(seqName); name2Taxonomy[seqName] = currentNode; - // tree[currentNode].level = level; - // tree[currentNode].childNumber = numChildren; - // tree[currentNode].heirarchyID = heirarchyID + '.' + toString(tree[currentNode].childNumber); } - - if (seqTaxonomy == "") { uniqueTaxonomies[currentNode] = currentNode; } + + if (seqTaxonomy == "") { uniqueTaxonomies.insert(currentNode); } } - + + return 0; } catch(exception& e) { m->errorOut(e, "PhyloTree", "addSeqToTree"); @@ -271,9 +297,16 @@ vector PhyloTree::getGenusNodes() { try { genusIndex.clear(); //generate genusIndexes - map::iterator it2; - for (it2=uniqueTaxonomies.begin(); it2!=uniqueTaxonomies.end(); it2++) { genusIndex.push_back(it2->first); } + set::iterator it2; + map temp; + for (it2=uniqueTaxonomies.begin(); it2!=uniqueTaxonomies.end(); it2++) { genusIndex.push_back(*it2); temp[*it2] = genusIndex.size()-1; } + for (map::iterator itName = name2Taxonomy.begin(); itName != name2Taxonomy.end(); itName++) { + map::iterator itTemp = temp.find(itName->second); + if (itTemp != temp.end()) { name2GenusNodeIndex[itName->first] = itTemp->second; } + else { m->mothurOut("[ERROR]: trouble making name2GenusNodeIndex, aborting.\n"); m->control_pressed = true; } + } + return genusIndex; } catch(exception& e) { @@ -310,6 +343,9 @@ void PhyloTree::assignHeirarchyIDs(int index){ int counter = 1; for(it=tree[index].children.begin();it!=tree[index].children.end();it++){ + + if (m->debug) { m->mothurOut(toString(index) +'\t' + tree[it->second].name +'\n'); } + tree[it->second].heirarchyID = tree[index].heirarchyID + '.' + toString(counter); counter++; tree[it->second].level = tree[index].level + 1; @@ -337,12 +373,7 @@ void PhyloTree::setUp(string tfile){ if (pid == 0) { binUnclassified(taxFileNameTest); } #else - //create file needed for summary if it doesn't exist - ifstream FileTest(taxFileNameTest.c_str()); - - if (!FileTest) { - binUnclassified(taxFileNameTest); - } + binUnclassified(taxFileNameTest); #endif } catch(exception& e) { @@ -355,26 +386,44 @@ void PhyloTree::binUnclassified(string file){ try { ofstream out; - openOutputFile(file, out); + m->openOutputFile(file, out); map::iterator itBin; map::iterator childPointer; vector copy = tree; - int copyNodes = numNodes; + //fill out tree + fillOutTree(0, copy); + + //get leaf nodes that may need extension + for (int i = 0; i < copy.size(); i++) { + + if (copy[i].children.size() == 0) { + leafNodes[i] = i; + } + } + + if (m->debug) { m->mothurOut("maxLevel = " + toString(maxLevel) +'\n'); } + + int copyNodes = copy.size(); + //go through the seqs and if a sequence finest taxon is not the same level as the most finely defined taxon then classify it as unclassified where necessary - for (itBin = name2Taxonomy.begin(); itBin != name2Taxonomy.end(); itBin++) { + map::iterator itLeaf; + for (itLeaf = leafNodes.begin(); itLeaf != leafNodes.end(); itLeaf++) { if (m->control_pressed) { out.close(); break; } - int level = copy[itBin->second].level; - int currentNode = itBin->second; + int level = copy[itLeaf->second].level; + int currentNode = itLeaf->second; + + if (m->debug) { m->mothurOut(copy[currentNode].name +'\n'); } //this sequence is unclassified at some levels - while(level != maxLevel){ - + while(level < maxLevel){ + level++; + if (m->debug) { m->mothurOut("level = " + toString(level) +'\n'); } string taxon = "unclassified"; @@ -383,7 +432,6 @@ void PhyloTree::binUnclassified(string file){ if(childPointer != copy[currentNode].children.end()){ //if the node already exists, move on currentNode = childPointer->second; //currentNode becomes 'unclassified' - copy[currentNode].accessions.push_back(itBin->first); //add this seq } else{ //otherwise, create it copy.push_back(TaxNode(taxon)); @@ -393,7 +441,6 @@ void PhyloTree::binUnclassified(string file){ copy[copyNodes-1].level = copy[currentNode].level + 1; currentNode = copy[currentNode].children[taxon]; - copy[currentNode].accessions.push_back(itBin->first); } } } @@ -410,6 +457,33 @@ void PhyloTree::binUnclassified(string file){ } } /**************************************************************************************************/ +void PhyloTree::fillOutTree(int index, vector& copy) { + try { + + map::iterator it; + + it = copy[index].children.find("unclassified"); + if (it == copy[index].children.end()) { //no unclassified at this level + string taxon = "unclassified"; + copy.push_back(TaxNode(taxon)); + copy[index].children[taxon] = copy.size()-1; + copy[copy.size()-1].parent = index; + copy[copy.size()-1].level = copy[index].level + 1; + } + + if (tree[index].level < maxLevel) { + for(it=tree[index].children.begin();it!=tree[index].children.end();it++){ //check your children + fillOutTree(it->second, copy); + } + } + + } + catch(exception& e) { + m->errorOut(e, "PhyloTree", "fillOutTree"); + exit(1); + } +} +/**************************************************************************************************/ string PhyloTree::getFullTaxonomy(string seqName) { try { string tax = ""; @@ -432,10 +506,16 @@ string PhyloTree::getFullTaxonomy(string seqName) { void PhyloTree::print(ofstream& out, vector& copy){ try { + + //output mothur version + out << "#" << m->getVersion() << endl; + out << copy.size() << endl; + out << maxLevel << endl; + for (int i = 0; i < copy.size(); i++) { - + out << copy[i].level << '\t'<< copy[i].name << '\t' << copy[i].children.size() << '\t'; map::iterator it; @@ -465,7 +545,10 @@ void PhyloTree::printTreeNodes(string treefilename) { #endif ofstream outTree; - openOutputFile(treefilename, outTree); + m->openOutputFile(treefilename, outTree); + + //output mothur version + outTree << "#" << m->getVersion() << endl; //print treenodes outTree << tree.size() << endl; @@ -475,11 +558,10 @@ void PhyloTree::printTreeNodes(string treefilename) { //print genus nodes outTree << endl << uniqueTaxonomies.size() << endl; - map::iterator it2; - for (it2=uniqueTaxonomies.begin(); it2!=uniqueTaxonomies.end(); it2++) { outTree << it2->first << '\t' << tree[it2->first].accessions.size() << endl; } + set::iterator it2; + for (it2=uniqueTaxonomies.begin(); it2!=uniqueTaxonomies.end(); it2++) { outTree << *it2 << '\t' << tree[*it2].accessions.size() << endl; } outTree << endl; - outTree.close(); #ifdef USE_MPI @@ -494,6 +576,97 @@ void PhyloTree::printTreeNodes(string treefilename) { } } /**************************************************************************************************/ +TaxNode PhyloTree::get(int i ){ + try { + if (i < tree.size()) { return tree[i]; } + else { cout << i << '\t' << tree.size() << endl ; m->mothurOut("Mismatch with taxonomy and template files. Cannot continue."); m->mothurOutEndLine(); exit(1); } + } + catch(exception& e) { + m->errorOut(e, "PhyloTree", "get"); + exit(1); + } +} +/**************************************************************************************************/ +TaxNode PhyloTree::get(string seqName){ + try { + map::iterator itFind = name2Taxonomy.find(seqName); + + if (itFind != name2Taxonomy.end()) { return tree[name2Taxonomy[seqName]]; } + else { m->mothurOut("Cannot find " + seqName + ". Mismatch with taxonomy and template files. Cannot continue."); m->mothurOutEndLine(); exit(1);} + } + catch(exception& e) { + m->errorOut(e, "PhyloTree", "get"); + exit(1); + } +} +/**************************************************************************************************/ +string PhyloTree::getName(int i ){ + try { + if (i < tree.size()) { return tree[i].name; } + else { m->mothurOut("Mismatch with taxonomy and template files. Cannot continue."); m->mothurOutEndLine(); exit(1); } + } + catch(exception& e) { + m->errorOut(e, "PhyloTree", "get"); + exit(1); + } +} +/**************************************************************************************************/ +int PhyloTree::getGenusIndex(string seqName){ + try { + map::iterator itFind = name2GenusNodeIndex.find(seqName); + + if (itFind != name2GenusNodeIndex.end()) { return itFind->second; } + else { m->mothurOut("Cannot find " + seqName + ". Could be a mismatch with taxonomy and template files. Cannot continue."); m->mothurOutEndLine(); exit(1);} + } + catch(exception& e) { + m->errorOut(e, "PhyloTree", "get"); + exit(1); + } +} +/**************************************************************************************************/ +bool PhyloTree::ErrorCheck(vector templateFileNames){ + try { + + bool okay = true; + templateFileNames.push_back("unknown"); + + map::iterator itFind; + map taxonomyFileNames = name2Taxonomy; + + if (m->debug) { m->mothurOut("[DEBUG]: in error check. Numseqs in template = " + toString(templateFileNames.size()) + ". Numseqs in taxonomy = " + toString(taxonomyFileNames.size()) + ".\n"); } + + for (int i = 0; i < templateFileNames.size(); i++) { + itFind = taxonomyFileNames.find(templateFileNames[i]); + + if (itFind != taxonomyFileNames.end()) { //found it so erase it + taxonomyFileNames.erase(itFind); + }else { + m->mothurOut("'" +templateFileNames[i] + "' is in your template file and is not in your taxonomy file. Please correct."); m->mothurOutEndLine(); + okay = false; + } + + //templateFileNames.erase(templateFileNames.begin()+i); + //i--; + } + templateFileNames.clear(); + + if (taxonomyFileNames.size() > 0) { //there are names in tax file that are not in template + okay = false; + + for (itFind = taxonomyFileNames.begin(); itFind != taxonomyFileNames.end(); itFind++) { + m->mothurOut(itFind->first + " is in your taxonomy file and is not in your template file. Please correct."); m->mothurOutEndLine(); + } + } + + return okay; + } + catch(exception& e) { + m->errorOut(e, "PhyloTree", "ErrorCheck"); + exit(1); + } +} +/**************************************************************************************************/ +