2 * rawTrainingDataMaker.cpp
5 * Created by westcott on 4/21/10.
6 * Copyright 2010 Schloss Lab. All rights reserved.
10 #include "phylosummary.h"
11 #include "referencedb.h"
12 /**************************************************************************************************/
14 PhyloSummary::PhyloSummary(string refTfile, CountTable* c, bool r){
16 m = MothurOut::getInstance();
25 //check for necessary files
26 if (refTfile == "saved") { ReferenceDB* rdb = ReferenceDB::getInstance(); refTfile = rdb->getSavedTaxonomy(); }
27 string taxFileNameTest = m->getFullPathName((refTfile.substr(0,refTfile.find_last_of(".")+1) + "tree.sum"));
28 ifstream FileTest(taxFileNameTest.c_str());
31 m->mothurOut("Error: can't find " + taxFileNameTest + "."); m->mothurOutEndLine(); exit(1);
33 readTreeStruct(FileTest);
41 m->errorOut(e, "PhyloSummary", "PhyloSummary");
46 /**************************************************************************************************/
48 PhyloSummary::PhyloSummary(CountTable* c, bool r){
50 m = MothurOut::getInstance();
59 tree.push_back(rawTaxNode("Root"));
63 m->errorOut(e, "PhyloSummary", "PhyloSummary");
67 /**************************************************************************************************/
68 PhyloSummary::PhyloSummary(string refTfile, GroupMap* g, bool r){
70 m = MothurOut::getInstance();
79 //check for necessary files
80 if (refTfile == "saved") { ReferenceDB* rdb = ReferenceDB::getInstance(); refTfile = rdb->getSavedTaxonomy(); }
81 string taxFileNameTest = m->getFullPathName((refTfile.substr(0,refTfile.find_last_of(".")+1) + "tree.sum"));
82 ifstream FileTest(taxFileNameTest.c_str());
85 m->mothurOut("Error: can't find " + taxFileNameTest + "."); m->mothurOutEndLine(); exit(1);
87 readTreeStruct(FileTest);
95 m->errorOut(e, "PhyloSummary", "PhyloSummary");
100 /**************************************************************************************************/
102 PhyloSummary::PhyloSummary(GroupMap* g, bool r){
104 m = MothurOut::getInstance();
113 tree.push_back(rawTaxNode("Root"));
116 catch(exception& e) {
117 m->errorOut(e, "PhyloSummary", "PhyloSummary");
121 /**************************************************************************************************/
123 int PhyloSummary::summarize(string userTfile){
125 map<string, string> temp;
126 m->readTax(userTfile, temp);
128 for (map<string, string>::iterator itTemp = temp.begin(); itTemp != temp.end();) {
129 addSeqToTree(itTemp->first, itTemp->second);
130 temp.erase(itTemp++);
135 catch(exception& e) {
136 m->errorOut(e, "PhyloSummary", "summarize");
141 /**************************************************************************************************/
143 string PhyloSummary::getNextTaxon(string& heirarchy){
145 string currentLevel = "";
147 int pos = heirarchy.find_first_of(';');
148 currentLevel=heirarchy.substr(0,pos);
149 if (pos != (heirarchy.length()-1)) { heirarchy=heirarchy.substr(pos+1); }
150 else { heirarchy = ""; }
154 catch(exception& e) {
155 m->errorOut(e, "PhyloSummary", "getNextTaxon");
160 /**************************************************************************************************/
162 int PhyloSummary::addSeqToTree(string seqName, string seqTaxonomy){
167 map<string, int>::iterator childPointer;
174 //are there confidence scores, if so remove them
175 if (seqTaxonomy.find_first_of('(') != -1) { m->removeConfidences(seqTaxonomy); }
177 while (seqTaxonomy != "") {
179 if (m->control_pressed) { return 0; }
181 //somehow the parent is getting one too many accnos
182 //use print to reassign the taxa id
183 taxon = getNextTaxon(seqTaxonomy);
185 childPointer = tree[currentNode].children.find(taxon);
187 if(childPointer != tree[currentNode].children.end()){ //if the node already exists, update count and move on
190 if (groupmap != NULL) {
191 //find out the sequences group
192 string group = groupmap->getGroup(seqName);
194 if (group == "not found") { m->mothurOut("[WARNING]: " + seqName + " is not in your groupfile, and will be included in the overall total, but not any group total."); m->mothurOutEndLine(); }
196 //do you have a count for this group?
197 map<string, int>::iterator itGroup = tree[childPointer->second].groupCount.find(group);
199 //if yes, increment it - there should not be a case where we can't find it since we load group in read
200 if (itGroup != tree[childPointer->second].groupCount.end()) {
201 tree[childPointer->second].groupCount[group]++;
203 }else if (ct != NULL) {
204 if (ct->hasGroupInfo()) {
205 vector<int> groupCounts = ct->getGroupCounts(seqName);
206 vector<string> groups = ct->getNamesOfGroups();
207 for (int i = 0; i < groups.size(); i++) {
209 if (groupCounts[i] != 0) {
210 //do you have a count for this group?
211 map<string, int>::iterator itGroup = tree[childPointer->second].groupCount.find(groups[i]);
213 //if yes, increment it - there should not be a case where we can't find it since we load group in read
214 if (itGroup != tree[childPointer->second].groupCount.end()) {
215 tree[childPointer->second].groupCount[groups[i]] += groupCounts[i];
220 thisCount = ct->getNumSeqs(seqName);
223 tree[childPointer->second].total += thisCount;
225 currentNode = childPointer->second;
229 tree.push_back(rawTaxNode(taxon));
230 int index = tree.size() - 1;
232 tree[index].parent = currentNode;
233 tree[index].level = (level+1);
234 tree[currentNode].children[taxon] = index;
237 //initialize groupcounts
238 if (groupmap != NULL) {
239 vector<string> mGroups = groupmap->getNamesOfGroups();
240 for (int j = 0; j < mGroups.size(); j++) {
241 tree[index].groupCount[mGroups[j]] = 0;
244 //find out the sequences group
245 string group = groupmap->getGroup(seqName);
247 if (group == "not found") { m->mothurOut("[WARNING]: " + seqName + " is not in your groupfile, and will be included in the overall total, but not any group total."); m->mothurOutEndLine(); }
249 //do you have a count for this group?
250 map<string, int>::iterator itGroup = tree[index].groupCount.find(group);
252 //if yes, increment it - there should not be a case where we can't find it since we load group in read
253 if (itGroup != tree[index].groupCount.end()) {
254 tree[index].groupCount[group]++;
256 }else if (ct != NULL) {
257 if (ct->hasGroupInfo()) {
258 vector<string> mGroups = ct->getNamesOfGroups();
259 for (int j = 0; j < mGroups.size(); j++) {
260 tree[index].groupCount[mGroups[j]] = 0;
262 vector<int> groupCounts = ct->getGroupCounts(seqName);
263 vector<string> groups = ct->getNamesOfGroups();
265 for (int i = 0; i < groups.size(); i++) {
266 if (groupCounts[i] != 0) {
268 //do you have a count for this group?
269 map<string, int>::iterator itGroup = tree[index].groupCount.find(groups[i]);
271 //if yes, increment it - there should not be a case where we can't find it since we load group in read
272 if (itGroup != tree[index].groupCount.end()) {
273 tree[index].groupCount[groups[i]]+=groupCounts[i];
278 thisCount = ct->getNumSeqs(seqName);
281 tree[index].total = thisCount;
284 }else{ //otherwise, error
285 m->mothurOut("Warning: cannot find taxon " + taxon + " in reference taxonomy tree at level " + toString(tree[currentNode].level) + " for " + seqName + ". This may cause totals of daughter levels not to add up in summary file."); m->mothurOutEndLine();
292 if ((seqTaxonomy == "") && (level < maxLevel)) { //if you think you are done and you are not.
293 for (int k = level; k < maxLevel; k++) { seqTaxonomy += "unclassified;"; }
298 catch(exception& e) {
299 m->errorOut(e, "PhyloSummary", "addSeqToTree");
303 /**************************************************************************************************/
305 int PhyloSummary::addSeqToTree(string seqTaxonomy, map<string, bool> containsGroup){
309 map<string, int>::iterator childPointer;
316 //are there confidence scores, if so remove them
317 if (seqTaxonomy.find_first_of('(') != -1) { m->removeConfidences(seqTaxonomy); }
319 while (seqTaxonomy != "") {
321 if (m->control_pressed) { return 0; }
323 //somehow the parent is getting one too many accnos
324 //use print to reassign the taxa id
325 taxon = getNextTaxon(seqTaxonomy);
327 childPointer = tree[currentNode].children.find(taxon);
329 if(childPointer != tree[currentNode].children.end()){ //if the node already exists, update count and move on
330 for (map<string, bool>::iterator itGroup = containsGroup.begin(); itGroup != containsGroup.end(); itGroup++) {
331 if (itGroup->second == true) {
332 tree[childPointer->second].groupCount[itGroup->first]++;
336 tree[childPointer->second].total++;
338 currentNode = childPointer->second;
342 tree.push_back(rawTaxNode(taxon));
343 int index = tree.size() - 1;
345 tree[index].parent = currentNode;
346 tree[index].level = (level+1);
347 tree[index].total = 1;
348 tree[currentNode].children[taxon] = index;
350 for (map<string, bool>::iterator itGroup = containsGroup.begin(); itGroup != containsGroup.end(); itGroup++) {
351 if (itGroup->second == true) {
352 tree[index].groupCount[itGroup->first]++;
358 }else{ //otherwise, error
359 m->mothurOut("Warning: cannot find taxon " + taxon + " in reference taxonomy tree at level " + toString(tree[currentNode].level) + ". This may cause totals of daughter levels not to add up in summary file."); m->mothurOutEndLine();
366 if ((seqTaxonomy == "") && (level < maxLevel)) { //if you think you are done and you are not.
367 for (int k = level; k < maxLevel; k++) { seqTaxonomy += "unclassified;"; }
372 catch(exception& e) {
373 m->errorOut(e, "PhyloSummary", "addSeqToTree");
378 /**************************************************************************************************/
380 void PhyloSummary::assignRank(int index){
382 map<string,int>::iterator it;
385 for(it=tree[index].children.begin();it!=tree[index].children.end();it++){
386 tree[it->second].rank = tree[index].rank + '.' + toString(counter);
388 assignRank(it->second);
391 catch(exception& e) {
392 m->errorOut(e, "PhyloSummary", "assignRank");
396 /**************************************************************************************************/
398 void PhyloSummary::print(ofstream& out){
401 if (ignore) { assignRank(0); }
402 vector<string> mGroups;
404 out << "taxlevel\t rankID\t taxon\t daughterlevels\t total\t";
405 if (groupmap != NULL) {
406 //so the labels match the counts below, since the map sorts them automatically...
407 //sort(groupmap->namesOfGroups.begin(), groupmap->namesOfGroups.end());
408 mGroups = groupmap->getNamesOfGroups();
409 for (int i = 0; i < mGroups.size(); i++) {
410 out << mGroups[i] << '\t';
412 }else if (ct != NULL) {
413 if (ct->hasGroupInfo()) {
414 mGroups = ct->getNamesOfGroups();
415 for (int i = 0; i < mGroups.size(); i++) {
416 out << mGroups[i] << '\t';
422 int totalChildrenInTree = 0;
423 map<string, int>::iterator itGroup;
424 map<string,int>::iterator it;
425 for(it=tree[0].children.begin();it!=tree[0].children.end();it++){
426 if (tree[it->second].total != 0) {
427 totalChildrenInTree++;
428 tree[0].total += tree[it->second].total;
430 if (groupmap != NULL) {
431 for (int i = 0; i < mGroups.size(); i++) { tree[0].groupCount[mGroups[i]] += tree[it->second].groupCount[mGroups[i]]; }
432 }else if ( ct != NULL) {
433 if (ct->hasGroupInfo()) { for (int i = 0; i < mGroups.size(); i++) { tree[0].groupCount[mGroups[i]] += tree[it->second].groupCount[mGroups[i]]; } }
440 out.setf(ios::fixed, ios::floatfield); out.setf(ios::showpoint);
441 out << tree[0].level << "\t" << tree[0].rank << "\t" << tree[0].name << "\t" << totalChildrenInTree << "\t" << (tree[0].total/(double) tree[0].total) << '\t';
443 if (groupmap != NULL) {
444 for (int i = 0; i < mGroups.size(); i++) {
445 double thisNum = tree[0].groupCount[mGroups[i]];
446 thisNum /= (double) groupmap->getNumSeqs(mGroups[i]);
447 out << thisNum << '\t';
449 }else if ( ct != NULL) {
450 if (ct->hasGroupInfo()) {
451 for (int i = 0; i < mGroups.size(); i++) {
452 double thisNum = tree[0].groupCount[mGroups[i]];
453 thisNum /= (double) ct->getGroupCount(mGroups[i]);
454 out << thisNum << '\t';
460 out << tree[0].level << "\t" << tree[0].rank << "\t" << tree[0].name << "\t" << totalChildrenInTree << "\t" << tree[0].total << "\t";
461 if (groupmap != NULL) {
462 for (int i = 0; i < mGroups.size(); i++) { out << tree[0].groupCount[mGroups[i]] << '\t'; }
463 }else if ( ct != NULL) {
464 if (ct->hasGroupInfo()) { for (int i = 0; i < mGroups.size(); i++) { out << tree[0].groupCount[mGroups[i]] << '\t'; } }
473 catch(exception& e) {
474 m->errorOut(e, "PhyloSummary", "print");
478 /**************************************************************************************************/
480 void PhyloSummary::print(ofstream& out, bool relabund){
483 if (ignore) { assignRank(0); }
485 int totalChildrenInTree = 0;
486 map<string, int>::iterator itGroup;
488 map<string,int>::iterator it;
489 for(it=tree[0].children.begin();it!=tree[0].children.end();it++){
490 if (tree[it->second].total != 0) {
491 totalChildrenInTree++;
492 tree[0].total += tree[it->second].total;
494 if (groupmap != NULL) {
495 vector<string> mGroups = groupmap->getNamesOfGroups();
496 for (int i = 0; i < mGroups.size(); i++) { tree[0].groupCount[mGroups[i]] += tree[it->second].groupCount[mGroups[i]]; }
497 }else if ( ct != NULL) {
498 vector<string> mGroups = ct->getNamesOfGroups();
499 if (ct->hasGroupInfo()) { for (int i = 0; i < mGroups.size(); i++) { tree[0].groupCount[mGroups[i]] += tree[it->second].groupCount[mGroups[i]]; } }
505 out << tree[0].name << "\t" << "1.0000" << "\t"; //root relative abundance is 1, everyone classifies to root
508 if (groupmap != NULL) {
509 for (int i = 0; i < mGroups.size(); i++) { out << tree[0].groupCount[mGroups[i]] << '\t'; }
510 }else if ( ct != NULL) {
511 if (ct->hasGroupInfo()) { for (int i = 0; i < mGroups.size(); i++) { out << tree[0].groupCount[mGroups[i]] << '\t'; } }
514 if (groupmap != NULL) {
515 vector<string> mGroups = groupmap->getNamesOfGroups();
516 for (int i = 0; i < mGroups.size(); i++) { out << "1.0000" << '\t'; }
517 }else if ( ct != NULL) {
518 vector<string> mGroups = ct->getNamesOfGroups();
519 if (ct->hasGroupInfo()) { for (int i = 0; i < mGroups.size(); i++) { out << "1.0000" << '\t'; } }
525 print(0, out, relabund);
528 catch(exception& e) {
529 m->errorOut(e, "PhyloSummary", "print");
533 /**************************************************************************************************/
535 void PhyloSummary::print(int i, ofstream& out){
537 map<string,int>::iterator it;
538 for(it=tree[i].children.begin();it!=tree[i].children.end();it++){
540 if (tree[it->second].total != 0) {
542 int totalChildrenInTree = 0;
544 map<string,int>::iterator it2;
545 for(it2=tree[it->second].children.begin();it2!=tree[it->second].children.end();it2++){
546 if (tree[it2->second].total != 0) { totalChildrenInTree++; }
550 out << tree[it->second].level << "\t" << tree[it->second].rank << "\t" << tree[it->second].name << "\t" << totalChildrenInTree << "\t" << (tree[it->second].total/(double) tree[0].total) << "\t";
552 out << tree[it->second].level << "\t" << tree[it->second].rank << "\t" << tree[it->second].name << "\t" << totalChildrenInTree << "\t" << tree[it->second].total << "\t";
556 map<string, int>::iterator itGroup;
557 if (groupmap != NULL) {
558 vector<string> mGroups = groupmap->getNamesOfGroups();
559 for (int i = 0; i < mGroups.size(); i++) { out << (tree[it->second].groupCount[mGroups[i]]/(double)groupmap->getNumSeqs(mGroups[i])) << '\t'; }
560 }else if (ct != NULL) {
561 if (ct->hasGroupInfo()) {
562 vector<string> mGroups = ct->getNamesOfGroups();
563 for (int i = 0; i < mGroups.size(); i++) { out << (tree[it->second].groupCount[mGroups[i]]/(double)ct->getGroupCount(mGroups[i])) << '\t'; }
567 map<string, int>::iterator itGroup;
568 if (groupmap != NULL) {
569 vector<string> mGroups = groupmap->getNamesOfGroups();
570 for (int i = 0; i < mGroups.size(); i++) { out << tree[it->second].groupCount[mGroups[i]] << '\t'; }
571 }else if (ct != NULL) {
572 if (ct->hasGroupInfo()) {
573 vector<string> mGroups = ct->getNamesOfGroups();
574 for (int i = 0; i < mGroups.size(); i++) { out << tree[it->second].groupCount[mGroups[i]] << '\t'; }
583 print(it->second, out);
586 catch(exception& e) {
587 m->errorOut(e, "PhyloSummary", "print");
592 /**************************************************************************************************/
594 void PhyloSummary::print(int i, ofstream& out, bool relabund){
596 map<string,int>::iterator it;
597 for(it=tree[i].children.begin();it!=tree[i].children.end();it++){
599 if (tree[it->second].total != 0) {
601 int totalChildrenInTree = 0;
603 map<string,int>::iterator it2;
604 for(it2=tree[it->second].children.begin();it2!=tree[it->second].children.end();it2++){
605 if (tree[it2->second].total != 0) { totalChildrenInTree++; }
608 string nodeName = "";
609 int thisNode = it->second;
610 while (tree[thisNode].rank != "0") { //while you are not at top
611 if (m->control_pressed) { break; }
612 nodeName = tree[thisNode].name + "|" + nodeName;
613 thisNode = tree[thisNode].parent;
615 if (nodeName != "") { nodeName = nodeName.substr(0, nodeName.length()-1); }
617 out << nodeName << "\t" << (tree[it->second].total / (float)tree[i].total) << "\t";
619 map<string, int>::iterator itGroup;
620 if (groupmap != NULL) {
621 vector<string> mGroups = groupmap->getNamesOfGroups();
622 for (int j = 0; j < mGroups.size(); j++) {
623 if (tree[i].groupCount[mGroups[j]] == 0) {
625 }else { out << (tree[it->second].groupCount[mGroups[j]] / (float)tree[i].groupCount[mGroups[j]]) << '\t'; }
627 }else if (ct != NULL) {
628 if (ct->hasGroupInfo()) {
629 vector<string> mGroups = ct->getNamesOfGroups();
630 for (int j = 0; j < mGroups.size(); j++) {
631 if (tree[i].groupCount[mGroups[j]] == 0) {
633 }else { out << (tree[it->second].groupCount[mGroups[j]] / (float)tree[i].groupCount[mGroups[j]]) << '\t'; }
641 print(it->second, out, relabund);
644 catch(exception& e) {
645 m->errorOut(e, "PhyloSummary", "print");
649 /**************************************************************************************************/
650 void PhyloSummary::readTreeStruct(ifstream& in){
654 string line = m->getline(in); m->gobble(in);
658 in >> num; m->gobble(in);
662 in >> maxLevel; m->gobble(in);
665 for (int i = 0; i < tree.size(); i++) {
667 in >> tree[i].level >> tree[i].name >> num; //num contains the number of children tree[i] has
672 for (int j = 0; j < num; j++) {
673 in >> childName >> childIndex;
674 tree[i].children[childName] = childIndex;
677 //initialize groupcounts
678 if (groupmap != NULL) {
679 for (int j = 0; j < (groupmap->getNamesOfGroups()).size(); j++) {
680 tree[i].groupCount[(groupmap->getNamesOfGroups())[j]] = 0;
682 }else if (ct != NULL) {
683 if (ct->hasGroupInfo()) {
684 for (int j = 0; j < (ct->getNamesOfGroups()).size(); j++) {
685 tree[i].groupCount[(ct->getNamesOfGroups())[j]] = 0;
694 //if (tree[i].level > maxLevel) { maxLevel = tree[i].level; }
698 catch(exception& e) {
699 m->errorOut(e, "PhyloSummary", "readTreeStruct");
703 /**************************************************************************************************/