2 * rawTrainingDataMaker.cpp
5 * Created by westcott on 4/21/10.
6 * Copyright 2010 Schloss Lab. All rights reserved.
10 #include "phylosummary.h"
11 #include "referencedb.h"
12 /**************************************************************************************************/
14 PhyloSummary::PhyloSummary(string refTfile, CountTable* c){
16 m = MothurOut::getInstance();
24 //check for necessary files
25 if (refTfile == "saved") { ReferenceDB* rdb = ReferenceDB::getInstance(); refTfile = rdb->getSavedTaxonomy(); }
26 string taxFileNameTest = m->getFullPathName((refTfile.substr(0,refTfile.find_last_of(".")+1) + "tree.sum"));
27 ifstream FileTest(taxFileNameTest.c_str());
30 m->mothurOut("Error: can't find " + taxFileNameTest + "."); m->mothurOutEndLine(); exit(1);
32 readTreeStruct(FileTest);
40 m->errorOut(e, "PhyloSummary", "PhyloSummary");
45 /**************************************************************************************************/
47 PhyloSummary::PhyloSummary(CountTable* c){
49 m = MothurOut::getInstance();
57 tree.push_back(rawTaxNode("Root"));
61 m->errorOut(e, "PhyloSummary", "PhyloSummary");
65 /**************************************************************************************************/
66 PhyloSummary::PhyloSummary(string refTfile, GroupMap* g){
68 m = MothurOut::getInstance();
76 //check for necessary files
77 if (refTfile == "saved") { ReferenceDB* rdb = ReferenceDB::getInstance(); refTfile = rdb->getSavedTaxonomy(); }
78 string taxFileNameTest = m->getFullPathName((refTfile.substr(0,refTfile.find_last_of(".")+1) + "tree.sum"));
79 ifstream FileTest(taxFileNameTest.c_str());
82 m->mothurOut("Error: can't find " + taxFileNameTest + "."); m->mothurOutEndLine(); exit(1);
84 readTreeStruct(FileTest);
92 m->errorOut(e, "PhyloSummary", "PhyloSummary");
97 /**************************************************************************************************/
99 PhyloSummary::PhyloSummary(GroupMap* g){
101 m = MothurOut::getInstance();
109 tree.push_back(rawTaxNode("Root"));
112 catch(exception& e) {
113 m->errorOut(e, "PhyloSummary", "PhyloSummary");
117 /**************************************************************************************************/
119 int PhyloSummary::summarize(string userTfile){
121 map<string, string> temp;
122 m->readTax(userTfile, temp);
124 for (map<string, string>::iterator itTemp = temp.begin(); itTemp != temp.end();) {
125 addSeqToTree(itTemp->first, itTemp->second);
126 temp.erase(itTemp++);
131 catch(exception& e) {
132 m->errorOut(e, "PhyloSummary", "summarize");
137 /**************************************************************************************************/
139 string PhyloSummary::getNextTaxon(string& heirarchy){
141 string currentLevel = "";
143 int pos = heirarchy.find_first_of(';');
144 currentLevel=heirarchy.substr(0,pos);
145 if (pos != (heirarchy.length()-1)) { heirarchy=heirarchy.substr(pos+1); }
146 else { heirarchy = ""; }
150 catch(exception& e) {
151 m->errorOut(e, "PhyloSummary", "getNextTaxon");
156 /**************************************************************************************************/
158 int PhyloSummary::addSeqToTree(string seqName, string seqTaxonomy){
163 map<string, int>::iterator childPointer;
170 //are there confidence scores, if so remove them
171 if (seqTaxonomy.find_first_of('(') != -1) { m->removeConfidences(seqTaxonomy); }
173 while (seqTaxonomy != "") {
175 if (m->control_pressed) { return 0; }
177 //somehow the parent is getting one too many accnos
178 //use print to reassign the taxa id
179 taxon = getNextTaxon(seqTaxonomy);
181 childPointer = tree[currentNode].children.find(taxon);
183 if(childPointer != tree[currentNode].children.end()){ //if the node already exists, update count and move on
186 if (groupmap != NULL) {
187 //find out the sequences group
188 string group = groupmap->getGroup(seqName);
190 if (group == "not found") { m->mothurOut("[WARNING]: " + seqName + " is not in your groupfile, and will be included in the overall total, but not any group total."); m->mothurOutEndLine(); }
192 //do you have a count for this group?
193 map<string, int>::iterator itGroup = tree[childPointer->second].groupCount.find(group);
195 //if yes, increment it - there should not be a case where we can't find it since we load group in read
196 if (itGroup != tree[childPointer->second].groupCount.end()) {
197 tree[childPointer->second].groupCount[group]++;
199 }else if (ct != NULL) {
200 if (ct->hasGroupInfo()) {
201 vector<int> groupCounts = ct->getGroupCounts(seqName);
202 vector<string> groups = ct->getNamesOfGroups();
203 for (int i = 0; i < groups.size(); i++) {
205 if (groupCounts[i] != 0) {
206 //do you have a count for this group?
207 map<string, int>::iterator itGroup = tree[childPointer->second].groupCount.find(groups[i]);
209 //if yes, increment it - there should not be a case where we can't find it since we load group in read
210 if (itGroup != tree[childPointer->second].groupCount.end()) {
211 tree[childPointer->second].groupCount[groups[i]] += groupCounts[i];
216 thisCount = ct->getNumSeqs(seqName);
219 tree[childPointer->second].total += thisCount;
221 currentNode = childPointer->second;
225 tree.push_back(rawTaxNode(taxon));
226 int index = tree.size() - 1;
228 tree[index].parent = currentNode;
229 tree[index].level = (level+1);
230 tree[currentNode].children[taxon] = index;
233 //initialize groupcounts
234 if (groupmap != NULL) {
235 vector<string> mGroups = groupmap->getNamesOfGroups();
236 for (int j = 0; j < mGroups.size(); j++) {
237 tree[index].groupCount[mGroups[j]] = 0;
240 //find out the sequences group
241 string group = groupmap->getGroup(seqName);
243 if (group == "not found") { m->mothurOut("[WARNING]: " + seqName + " is not in your groupfile, and will be included in the overall total, but not any group total."); m->mothurOutEndLine(); }
245 //do you have a count for this group?
246 map<string, int>::iterator itGroup = tree[index].groupCount.find(group);
248 //if yes, increment it - there should not be a case where we can't find it since we load group in read
249 if (itGroup != tree[index].groupCount.end()) {
250 tree[index].groupCount[group]++;
252 }else if (ct != NULL) {
253 if (ct->hasGroupInfo()) {
254 vector<string> mGroups = ct->getNamesOfGroups();
255 for (int j = 0; j < mGroups.size(); j++) {
256 tree[index].groupCount[mGroups[j]] = 0;
258 vector<int> groupCounts = ct->getGroupCounts(seqName);
259 vector<string> groups = ct->getNamesOfGroups();
261 for (int i = 0; i < groups.size(); i++) {
262 if (groupCounts[i] != 0) {
264 //do you have a count for this group?
265 map<string, int>::iterator itGroup = tree[index].groupCount.find(groups[i]);
267 //if yes, increment it - there should not be a case where we can't find it since we load group in read
268 if (itGroup != tree[index].groupCount.end()) {
269 tree[index].groupCount[groups[i]]+=groupCounts[i];
274 thisCount = ct->getNumSeqs(seqName);
277 tree[index].total = thisCount;
280 }else{ //otherwise, error
281 m->mothurOut("Warning: cannot find taxon " + taxon + " in reference taxonomy tree at level " + toString(tree[currentNode].level) + " for " + seqName + ". This may cause totals of daughter levels not to add up in summary file."); m->mothurOutEndLine();
288 if ((seqTaxonomy == "") && (level < maxLevel)) { //if you think you are done and you are not.
289 for (int k = level; k < maxLevel; k++) { seqTaxonomy += "unclassified;"; }
294 catch(exception& e) {
295 m->errorOut(e, "PhyloSummary", "addSeqToTree");
299 /**************************************************************************************************/
301 int PhyloSummary::addSeqToTree(string seqTaxonomy, map<string, bool> containsGroup){
305 map<string, int>::iterator childPointer;
312 //are there confidence scores, if so remove them
313 if (seqTaxonomy.find_first_of('(') != -1) { m->removeConfidences(seqTaxonomy); }
315 while (seqTaxonomy != "") {
317 if (m->control_pressed) { return 0; }
319 //somehow the parent is getting one too many accnos
320 //use print to reassign the taxa id
321 taxon = getNextTaxon(seqTaxonomy);
323 childPointer = tree[currentNode].children.find(taxon);
325 if(childPointer != tree[currentNode].children.end()){ //if the node already exists, update count and move on
326 for (map<string, bool>::iterator itGroup = containsGroup.begin(); itGroup != containsGroup.end(); itGroup++) {
327 if (itGroup->second == true) {
328 tree[childPointer->second].groupCount[itGroup->first]++;
332 tree[childPointer->second].total++;
334 currentNode = childPointer->second;
338 tree.push_back(rawTaxNode(taxon));
339 int index = tree.size() - 1;
341 tree[index].parent = currentNode;
342 tree[index].level = (level+1);
343 tree[index].total = 1;
344 tree[currentNode].children[taxon] = index;
346 for (map<string, bool>::iterator itGroup = containsGroup.begin(); itGroup != containsGroup.end(); itGroup++) {
347 if (itGroup->second == true) {
348 tree[index].groupCount[itGroup->first]++;
354 }else{ //otherwise, error
355 m->mothurOut("Warning: cannot find taxon " + taxon + " in reference taxonomy tree at level " + toString(tree[currentNode].level) + ". This may cause totals of daughter levels not to add up in summary file."); m->mothurOutEndLine();
362 if ((seqTaxonomy == "") && (level < maxLevel)) { //if you think you are done and you are not.
363 for (int k = level; k < maxLevel; k++) { seqTaxonomy += "unclassified;"; }
368 catch(exception& e) {
369 m->errorOut(e, "PhyloSummary", "addSeqToTree");
374 /**************************************************************************************************/
376 void PhyloSummary::assignRank(int index){
378 map<string,int>::iterator it;
381 for(it=tree[index].children.begin();it!=tree[index].children.end();it++){
382 tree[it->second].rank = tree[index].rank + '.' + toString(counter);
385 assignRank(it->second);
388 catch(exception& e) {
389 m->errorOut(e, "PhyloSummary", "assignRank");
393 /**************************************************************************************************/
395 void PhyloSummary::print(ofstream& out){
398 if (ignore) { assignRank(0); }
399 vector<string> mGroups;
401 out << "taxlevel\t rankID\t taxon\t daughterlevels\t total\t";
402 if (groupmap != NULL) {
403 //so the labels match the counts below, since the map sorts them automatically...
404 //sort(groupmap->namesOfGroups.begin(), groupmap->namesOfGroups.end());
405 mGroups = groupmap->getNamesOfGroups();
406 for (int i = 0; i < mGroups.size(); i++) {
407 out << mGroups[i] << '\t';
409 }else if (ct != NULL) {
410 if (ct->hasGroupInfo()) {
411 mGroups = ct->getNamesOfGroups();
412 for (int i = 0; i < mGroups.size(); i++) {
413 out << mGroups[i] << '\t';
420 int totalChildrenInTree = 0;
421 map<string, int>::iterator itGroup;
423 map<string,int>::iterator it;
424 for(it=tree[0].children.begin();it!=tree[0].children.end();it++){
425 if (tree[it->second].total != 0) {
426 totalChildrenInTree++;
427 tree[0].total += tree[it->second].total;
429 if (groupmap != NULL) {
430 for (int i = 0; i < mGroups.size(); i++) { tree[0].groupCount[mGroups[i]] += tree[it->second].groupCount[mGroups[i]]; }
431 }else if ( ct != NULL) {
432 if (ct->hasGroupInfo()) { for (int i = 0; i < mGroups.size(); i++) { tree[0].groupCount[mGroups[i]] += tree[it->second].groupCount[mGroups[i]]; } }
438 out << tree[0].level << "\t" << tree[0].rank << "\t" << tree[0].name << "\t" << totalChildrenInTree << "\t" << tree[0].total << "\t";
441 if (groupmap != NULL) {
442 for (int i = 0; i < mGroups.size(); i++) { out << tree[0].groupCount[mGroups[i]] << '\t'; }
443 }else if ( ct != NULL) {
444 if (ct->hasGroupInfo()) { for (int i = 0; i < mGroups.size(); i++) { out << tree[0].groupCount[mGroups[i]] << '\t'; } }
452 catch(exception& e) {
453 m->errorOut(e, "PhyloSummary", "print");
458 /**************************************************************************************************/
460 void PhyloSummary::print(int i, ofstream& out){
462 map<string,int>::iterator it;
463 for(it=tree[i].children.begin();it!=tree[i].children.end();it++){
465 if (tree[it->second].total != 0) {
467 int totalChildrenInTree = 0;
469 map<string,int>::iterator it2;
470 for(it2=tree[it->second].children.begin();it2!=tree[it->second].children.end();it2++){
471 if (tree[it2->second].total != 0) { totalChildrenInTree++; }
474 out << tree[it->second].level << "\t" << tree[it->second].rank << "\t" << tree[it->second].name << "\t" << totalChildrenInTree << "\t" << tree[it->second].total << "\t";
476 map<string, int>::iterator itGroup;
477 if (groupmap != NULL) {
478 //for (itGroup = tree[it->second].groupCount.begin(); itGroup != tree[it->second].groupCount.end(); itGroup++) {
479 // out << itGroup->second << '\t';
481 vector<string> mGroups = groupmap->getNamesOfGroups();
482 for (int i = 0; i < mGroups.size(); i++) { out << tree[it->second].groupCount[mGroups[i]] << '\t'; }
483 }else if (ct != NULL) {
484 if (ct->hasGroupInfo()) {
485 vector<string> mGroups = ct->getNamesOfGroups();
486 for (int i = 0; i < mGroups.size(); i++) { out << tree[it->second].groupCount[mGroups[i]] << '\t'; }
493 print(it->second, out);
496 catch(exception& e) {
497 m->errorOut(e, "PhyloSummary", "print");
501 /**************************************************************************************************/
502 void PhyloSummary::readTreeStruct(ifstream& in){
506 string line = m->getline(in); m->gobble(in);
510 in >> num; m->gobble(in);
514 in >> maxLevel; m->gobble(in);
517 for (int i = 0; i < tree.size(); i++) {
519 in >> tree[i].level >> tree[i].name >> num; //num contains the number of children tree[i] has
524 for (int j = 0; j < num; j++) {
525 in >> childName >> childIndex;
526 tree[i].children[childName] = childIndex;
529 //initialize groupcounts
530 if (groupmap != NULL) {
531 for (int j = 0; j < (groupmap->getNamesOfGroups()).size(); j++) {
532 tree[i].groupCount[(groupmap->getNamesOfGroups())[j]] = 0;
534 }else if (ct != NULL) {
535 if (ct->hasGroupInfo()) {
536 for (int j = 0; j < (ct->getNamesOfGroups()).size(); j++) {
537 tree[i].groupCount[(ct->getNamesOfGroups())[j]] = 0;
546 //if (tree[i].level > maxLevel) { maxLevel = tree[i].level; }
550 catch(exception& e) {
551 m->errorOut(e, "PhyloSummary", "readTreeStruct");
555 /**************************************************************************************************/