X-Git-Url: https://git.donarmstrong.com/?p=mothur.git;a=blobdiff_plain;f=phylosummary.cpp;h=9717087d72a55a6bc28ce2d77f02545a7eb5b612;hp=870f35fa3b994a78ed165d1179c8ee2d6daf207b;hb=a8e2df1b96a57f5f29576b08361b86a96a8eff4f;hpb=050220fe7822cc660615972a0054cf4a83eefbe4 diff --git a/phylosummary.cpp b/phylosummary.cpp index 870f35f..9717087 100644 --- a/phylosummary.cpp +++ b/phylosummary.cpp @@ -8,23 +8,74 @@ */ #include "phylosummary.h" - +#include "referencedb.h" /**************************************************************************************************/ -PhyloSummary::PhyloSummary(string refTfile, string groupFile){ +PhyloSummary::PhyloSummary(string refTfile, CountTable* c){ try { m = MothurOut::getInstance(); maxLevel = 0; + ignore = false; + numSeqs = 0; + + ct = c; + groupmap = NULL; + + //check for necessary files + if (refTfile == "saved") { ReferenceDB* rdb = ReferenceDB::getInstance(); refTfile = rdb->getSavedTaxonomy(); } + string taxFileNameTest = m->getFullPathName((refTfile.substr(0,refTfile.find_last_of(".")+1) + "tree.sum")); + ifstream FileTest(taxFileNameTest.c_str()); - if (groupFile != "") { - groupmap = new GroupMap(groupFile); - groupmap->readMap(); + if (!FileTest) { + m->mothurOut("Error: can't find " + taxFileNameTest + "."); m->mothurOutEndLine(); exit(1); }else{ - groupmap = NULL; + readTreeStruct(FileTest); } + + tree[0].rank = "0"; + assignRank(0); + + } + catch(exception& e) { + m->errorOut(e, "PhyloSummary", "PhyloSummary"); + exit(1); + } +} + +/**************************************************************************************************/ + +PhyloSummary::PhyloSummary(CountTable* c){ + try { + m = MothurOut::getInstance(); + maxLevel = 0; + ignore = true; + numSeqs = 0; + + ct = c; + groupmap = NULL; + + tree.push_back(rawTaxNode("Root")); + tree[0].rank = "0"; + } + catch(exception& e) { + m->errorOut(e, "PhyloSummary", "PhyloSummary"); + exit(1); + } +} +/**************************************************************************************************/ +PhyloSummary::PhyloSummary(string refTfile, GroupMap* g){ + try { + m = MothurOut::getInstance(); + maxLevel = 0; + ignore = false; + numSeqs = 0; + + groupmap = g; + ct = NULL; //check for necessary files - string taxFileNameTest = refTfile.substr(0,refTfile.find_last_of(".")+1) + "tree.sum"; + if (refTfile == "saved") { ReferenceDB* rdb = ReferenceDB::getInstance(); refTfile = rdb->getSavedTaxonomy(); } + string taxFileNameTest = m->getFullPathName((refTfile.substr(0,refTfile.find_last_of(".")+1) + "tree.sum")); ifstream FileTest(taxFileNameTest.c_str()); if (!FileTest) { @@ -42,24 +93,40 @@ PhyloSummary::PhyloSummary(string refTfile, string groupFile){ exit(1); } } + /**************************************************************************************************/ -void PhyloSummary::summarize(string userTfile){ +PhyloSummary::PhyloSummary(GroupMap* g){ try { + m = MothurOut::getInstance(); + maxLevel = 0; + ignore = true; + numSeqs = 0; - ifstream in; - openInputFile(userTfile, in); + groupmap = g; + ct = NULL; - //read in users taxonomy file and add sequences to tree - string name, tax; - while(!in.eof()){ - in >> name >> tax; gobble(in); - - addSeqToTree(name, tax); - - if (m->control_pressed) { break; } - } - in.close(); + tree.push_back(rawTaxNode("Root")); + tree[0].rank = "0"; + } + catch(exception& e) { + m->errorOut(e, "PhyloSummary", "PhyloSummary"); + exit(1); + } +} +/**************************************************************************************************/ + +int PhyloSummary::summarize(string userTfile){ + try { + map temp; + m->readTax(userTfile, temp); + + for (map::iterator itTemp = temp.begin(); itTemp != temp.end();) { + addSeqToTree(itTemp->first, itTemp->second); + temp.erase(itTemp++); + } + + return numSeqs; } catch(exception& e) { m->errorOut(e, "PhyloSummary", "summarize"); @@ -90,6 +157,7 @@ string PhyloSummary::getNextTaxon(string& heirarchy){ int PhyloSummary::addSeqToTree(string seqName, string seqTaxonomy){ try { + numSeqs++; map::iterator childPointer; @@ -99,6 +167,9 @@ int PhyloSummary::addSeqToTree(string seqName, string seqTaxonomy){ int level = 0; + //are there confidence scores, if so remove them + if (seqTaxonomy.find_first_of('(') != -1) { m->removeConfidences(seqTaxonomy); } + while (seqTaxonomy != "") { if (m->control_pressed) { return 0; } @@ -110,27 +181,106 @@ int PhyloSummary::addSeqToTree(string seqName, string seqTaxonomy){ childPointer = tree[currentNode].children.find(taxon); if(childPointer != tree[currentNode].children.end()){ //if the node already exists, update count and move on - if (groupmap != NULL) { + int thisCount = 1; + + if (groupmap != NULL) { //find out the sequences group string group = groupmap->getGroup(seqName); - if (group == "not found") { m->mothurOut(seqName + " is not in your groupfile, and will be included in the overall total, but not any group total."); m->mothurOutEndLine(); } + if (group == "not found") { m->mothurOut("[WARNING]: " + seqName + " is not in your groupfile, and will be included in the overall total, but not any group total."); m->mothurOutEndLine(); } //do you have a count for this group? - map::iterator itGroup = tree[currentNode].groupCount.find(group); + map::iterator itGroup = tree[childPointer->second].groupCount.find(group); //if yes, increment it - there should not be a case where we can't find it since we load group in read - if (itGroup != tree[currentNode].groupCount.end()) { - tree[currentNode].groupCount[group]++; + if (itGroup != tree[childPointer->second].groupCount.end()) { + tree[childPointer->second].groupCount[group]++; } - } + }else if (ct != NULL) { + if (ct->hasGroupInfo()) { + vector groupCounts = ct->getGroupCounts(seqName); + vector groups = ct->getNamesOfGroups(); + for (int i = 0; i < groups.size(); i++) { + + if (groupCounts[i] != 0) { + //do you have a count for this group? + map::iterator itGroup = tree[childPointer->second].groupCount.find(groups[i]); + + //if yes, increment it - there should not be a case where we can't find it since we load group in read + if (itGroup != tree[childPointer->second].groupCount.end()) { + tree[childPointer->second].groupCount[groups[i]] += groupCounts[i]; + } + } + } + } + thisCount = ct->getNumSeqs(seqName); + } - tree[currentNode].total++; + tree[childPointer->second].total += thisCount; currentNode = childPointer->second; - }else{ //otherwise, error - m->mothurOut("Warning: cannot find taxon " + taxon + " in reference taxonomy tree at level " + toString(tree[currentNode].level) + " for " + seqName + ". This may cause totals of daughter levels not to add up in summary file."); m->mothurOutEndLine(); - break; + }else{ + if (ignore) { + + tree.push_back(rawTaxNode(taxon)); + int index = tree.size() - 1; + + tree[index].parent = currentNode; + tree[index].level = (level+1); + tree[currentNode].children[taxon] = index; + int thisCount = 1; + + //initialize groupcounts + if (groupmap != NULL) { + vector mGroups = groupmap->getNamesOfGroups(); + for (int j = 0; j < mGroups.size(); j++) { + tree[index].groupCount[mGroups[j]] = 0; + } + + //find out the sequences group + string group = groupmap->getGroup(seqName); + + if (group == "not found") { m->mothurOut("[WARNING]: " + seqName + " is not in your groupfile, and will be included in the overall total, but not any group total."); m->mothurOutEndLine(); } + + //do you have a count for this group? + map::iterator itGroup = tree[index].groupCount.find(group); + + //if yes, increment it - there should not be a case where we can't find it since we load group in read + if (itGroup != tree[index].groupCount.end()) { + tree[index].groupCount[group]++; + } + }else if (ct != NULL) { + if (ct->hasGroupInfo()) { + vector mGroups = ct->getNamesOfGroups(); + for (int j = 0; j < mGroups.size(); j++) { + tree[index].groupCount[mGroups[j]] = 0; + } + vector groupCounts = ct->getGroupCounts(seqName); + vector groups = ct->getNamesOfGroups(); + + for (int i = 0; i < groups.size(); i++) { + if (groupCounts[i] != 0) { + + //do you have a count for this group? + map::iterator itGroup = tree[index].groupCount.find(groups[i]); + + //if yes, increment it - there should not be a case where we can't find it since we load group in read + if (itGroup != tree[index].groupCount.end()) { + tree[index].groupCount[groups[i]]+=groupCounts[i]; + } + } + } + } + thisCount = ct->getNumSeqs(seqName); + } + + tree[index].total = thisCount; + currentNode = index; + + }else{ //otherwise, error + m->mothurOut("Warning: cannot find taxon " + taxon + " in reference taxonomy tree at level " + toString(tree[currentNode].level) + " for " + seqName + ". This may cause totals of daughter levels not to add up in summary file."); m->mothurOutEndLine(); + break; + } } level++; @@ -139,13 +289,88 @@ int PhyloSummary::addSeqToTree(string seqName, string seqTaxonomy){ for (int k = level; k < maxLevel; k++) { seqTaxonomy += "unclassified;"; } } } + return 0; + } + catch(exception& e) { + m->errorOut(e, "PhyloSummary", "addSeqToTree"); + exit(1); + } +} +/**************************************************************************************************/ +int PhyloSummary::addSeqToTree(string seqTaxonomy, map containsGroup){ + try { + numSeqs++; + + map::iterator childPointer; + + int currentNode = 0; + string taxon; + + int level = 0; + + //are there confidence scores, if so remove them + if (seqTaxonomy.find_first_of('(') != -1) { m->removeConfidences(seqTaxonomy); } + + while (seqTaxonomy != "") { + + if (m->control_pressed) { return 0; } + + //somehow the parent is getting one too many accnos + //use print to reassign the taxa id + taxon = getNextTaxon(seqTaxonomy); + + childPointer = tree[currentNode].children.find(taxon); + + if(childPointer != tree[currentNode].children.end()){ //if the node already exists, update count and move on + for (map::iterator itGroup = containsGroup.begin(); itGroup != containsGroup.end(); itGroup++) { + if (itGroup->second == true) { + tree[childPointer->second].groupCount[itGroup->first]++; + } + } + + tree[childPointer->second].total++; + + currentNode = childPointer->second; + }else{ + if (ignore) { + + tree.push_back(rawTaxNode(taxon)); + int index = tree.size() - 1; + + tree[index].parent = currentNode; + tree[index].level = (level+1); + tree[index].total = 1; + tree[currentNode].children[taxon] = index; + + for (map::iterator itGroup = containsGroup.begin(); itGroup != containsGroup.end(); itGroup++) { + if (itGroup->second == true) { + tree[index].groupCount[itGroup->first]++; + } + } + + currentNode = index; + + }else{ //otherwise, error + m->mothurOut("Warning: cannot find taxon " + taxon + " in reference taxonomy tree at level " + toString(tree[currentNode].level) + ". This may cause totals of daughter levels not to add up in summary file."); m->mothurOutEndLine(); + break; + } + } + + level++; + + if ((seqTaxonomy == "") && (level < maxLevel)) { //if you think you are done and you are not. + for (int k = level; k < maxLevel; k++) { seqTaxonomy += "unclassified;"; } + } + } + return 0; } catch(exception& e) { m->errorOut(e, "PhyloSummary", "addSeqToTree"); exit(1); } } + /**************************************************************************************************/ void PhyloSummary::assignRank(int index){ @@ -169,36 +394,55 @@ void PhyloSummary::assignRank(int index){ void PhyloSummary::print(ofstream& out){ try { + + if (ignore) { assignRank(0); } + vector mGroups; //print labels out << "taxlevel\t rankID\t taxon\t daughterlevels\t total\t"; if (groupmap != NULL) { //so the labels match the counts below, since the map sorts them automatically... //sort(groupmap->namesOfGroups.begin(), groupmap->namesOfGroups.end()); - - for (int i = 0; i < groupmap->namesOfGroups.size(); i++) { - out << groupmap->namesOfGroups[i] << '\t'; + mGroups = groupmap->getNamesOfGroups(); + for (int i = 0; i < mGroups.size(); i++) { + out << mGroups[i] << '\t'; } - } + }else if (ct != NULL) { + if (ct->hasGroupInfo()) { + mGroups = ct->getNamesOfGroups(); + for (int i = 0; i < mGroups.size(); i++) { + out << mGroups[i] << '\t'; + } + } + } out << endl; int totalChildrenInTree = 0; + map::iterator itGroup; map::iterator it; for(it=tree[0].children.begin();it!=tree[0].children.end();it++){ - if (tree[it->second].total != 0) { totalChildrenInTree++; } + if (tree[it->second].total != 0) { + totalChildrenInTree++; + tree[0].total += tree[it->second].total; + + if (groupmap != NULL) { + for (int i = 0; i < mGroups.size(); i++) { tree[0].groupCount[mGroups[i]] += tree[it->second].groupCount[mGroups[i]]; } + }else if ( ct != NULL) { + if (ct->hasGroupInfo()) { for (int i = 0; i < mGroups.size(); i++) { tree[0].groupCount[mGroups[i]] += tree[it->second].groupCount[mGroups[i]]; } } + } + } } //print root out << tree[0].level << "\t" << tree[0].rank << "\t" << tree[0].name << "\t" << totalChildrenInTree << "\t" << tree[0].total << "\t"; - map::iterator itGroup; + if (groupmap != NULL) { - //for (itGroup = tree[0].groupCount.begin(); itGroup != tree[0].groupCount.end(); itGroup++) { - // out << itGroup->second << '\t'; - //} - for (int i = 0; i < groupmap->namesOfGroups.size(); i++) { out << tree[0].groupCount[groupmap->namesOfGroups[i]] << '\t'; } - } + for (int i = 0; i < mGroups.size(); i++) { out << tree[0].groupCount[mGroups[i]] << '\t'; } + }else if ( ct != NULL) { + if (ct->hasGroupInfo()) { for (int i = 0; i < mGroups.size(); i++) { out << tree[0].groupCount[mGroups[i]] << '\t'; } } + } out << endl; //print rest @@ -234,9 +478,16 @@ void PhyloSummary::print(int i, ofstream& out){ //for (itGroup = tree[it->second].groupCount.begin(); itGroup != tree[it->second].groupCount.end(); itGroup++) { // out << itGroup->second << '\t'; //} - for (int i = 0; i < groupmap->namesOfGroups.size(); i++) { out << tree[it->second].groupCount[groupmap->namesOfGroups[i]] << '\t'; } - } + vector mGroups = groupmap->getNamesOfGroups(); + for (int i = 0; i < mGroups.size(); i++) { out << tree[it->second].groupCount[mGroups[i]] << '\t'; } + }else if (ct != NULL) { + if (ct->hasGroupInfo()) { + vector mGroups = ct->getNamesOfGroups(); + for (int i = 0; i < mGroups.size(); i++) { out << tree[it->second].groupCount[mGroups[i]] << '\t'; } + } + } out << endl; + } print(it->second, out); @@ -252,15 +503,15 @@ void PhyloSummary::readTreeStruct(ifstream& in){ try { //read version - string line = getline(in); gobble(in); + string line = m->getline(in); m->gobble(in); int num; - in >> num; gobble(in); + in >> num; m->gobble(in); tree.resize(num); - in >> maxLevel; gobble(in); + in >> maxLevel; m->gobble(in); //read the tree file for (int i = 0; i < tree.size(); i++) { @@ -277,25 +528,30 @@ void PhyloSummary::readTreeStruct(ifstream& in){ //initialize groupcounts if (groupmap != NULL) { - for (int j = 0; j < groupmap->namesOfGroups.size(); j++) { - tree[i].groupCount[groupmap->namesOfGroups[j]] = 0; + for (int j = 0; j < (groupmap->getNamesOfGroups()).size(); j++) { + tree[i].groupCount[(groupmap->getNamesOfGroups())[j]] = 0; } - } + }else if (ct != NULL) { + if (ct->hasGroupInfo()) { + for (int j = 0; j < (ct->getNamesOfGroups()).size(); j++) { + tree[i].groupCount[(ct->getNamesOfGroups())[j]] = 0; + } + } + } tree[i].total = 0; - gobble(in); + m->gobble(in); //if (tree[i].level > maxLevel) { maxLevel = tree[i].level; } } } catch(exception& e) { - m->errorOut(e, "PhyloSummary", "print"); + m->errorOut(e, "PhyloSummary", "readTreeStruct"); exit(1); } } - /**************************************************************************************************/