2 * rawTrainingDataMaker.cpp
5 * Created by westcott on 4/21/10.
6 * Copyright 2010 Schloss Lab. All rights reserved.
10 #include "phylosummary.h"
11 /**************************************************************************************************/
13 PhyloSummary::PhyloSummary(string refTfile, CountTable* c){
15 m = MothurOut::getInstance();
23 //check for necessary files
24 string taxFileNameTest = m->getFullPathName((refTfile.substr(0,refTfile.find_last_of(".")+1) + "tree.sum"));
25 ifstream FileTest(taxFileNameTest.c_str());
28 m->mothurOut("Error: can't find " + taxFileNameTest + "."); m->mothurOutEndLine(); exit(1);
30 readTreeStruct(FileTest);
38 m->errorOut(e, "PhyloSummary", "PhyloSummary");
43 /**************************************************************************************************/
45 PhyloSummary::PhyloSummary(CountTable* c){
47 m = MothurOut::getInstance();
55 tree.push_back(rawTaxNode("Root"));
59 m->errorOut(e, "PhyloSummary", "PhyloSummary");
63 /**************************************************************************************************/
64 PhyloSummary::PhyloSummary(string refTfile, GroupMap* g){
66 m = MothurOut::getInstance();
74 //check for necessary files
75 string taxFileNameTest = m->getFullPathName((refTfile.substr(0,refTfile.find_last_of(".")+1) + "tree.sum"));
76 ifstream FileTest(taxFileNameTest.c_str());
79 m->mothurOut("Error: can't find " + taxFileNameTest + "."); m->mothurOutEndLine(); exit(1);
81 readTreeStruct(FileTest);
89 m->errorOut(e, "PhyloSummary", "PhyloSummary");
94 /**************************************************************************************************/
96 PhyloSummary::PhyloSummary(GroupMap* g){
98 m = MothurOut::getInstance();
106 tree.push_back(rawTaxNode("Root"));
109 catch(exception& e) {
110 m->errorOut(e, "PhyloSummary", "PhyloSummary");
114 /**************************************************************************************************/
116 int PhyloSummary::summarize(string userTfile){
118 map<string, string> temp;
119 m->readTax(userTfile, temp);
121 for (map<string, string>::iterator itTemp = temp.begin(); itTemp != temp.end();) {
122 addSeqToTree(itTemp->first, itTemp->second);
123 temp.erase(itTemp++);
128 catch(exception& e) {
129 m->errorOut(e, "PhyloSummary", "summarize");
134 /**************************************************************************************************/
136 string PhyloSummary::getNextTaxon(string& heirarchy){
138 string currentLevel = "";
140 int pos = heirarchy.find_first_of(';');
141 currentLevel=heirarchy.substr(0,pos);
142 if (pos != (heirarchy.length()-1)) { heirarchy=heirarchy.substr(pos+1); }
143 else { heirarchy = ""; }
147 catch(exception& e) {
148 m->errorOut(e, "PhyloSummary", "getNextTaxon");
153 /**************************************************************************************************/
155 int PhyloSummary::addSeqToTree(string seqName, string seqTaxonomy){
160 map<string, int>::iterator childPointer;
167 //are there confidence scores, if so remove them
168 if (seqTaxonomy.find_first_of('(') != -1) { m->removeConfidences(seqTaxonomy); }
170 while (seqTaxonomy != "") {
172 if (m->control_pressed) { return 0; }
174 //somehow the parent is getting one too many accnos
175 //use print to reassign the taxa id
176 taxon = getNextTaxon(seqTaxonomy);
178 childPointer = tree[currentNode].children.find(taxon);
180 if(childPointer != tree[currentNode].children.end()){ //if the node already exists, update count and move on
183 if (groupmap != NULL) {
184 //find out the sequences group
185 string group = groupmap->getGroup(seqName);
187 if (group == "not found") { m->mothurOut("[WARNING]: " + seqName + " is not in your groupfile, and will be included in the overall total, but not any group total."); m->mothurOutEndLine(); }
189 //do you have a count for this group?
190 map<string, int>::iterator itGroup = tree[childPointer->second].groupCount.find(group);
192 //if yes, increment it - there should not be a case where we can't find it since we load group in read
193 if (itGroup != tree[childPointer->second].groupCount.end()) {
194 tree[childPointer->second].groupCount[group]++;
196 }else if (ct != NULL) {
197 if (ct->hasGroupInfo()) {
198 vector<int> groupCounts = ct->getGroupCounts(seqName);
199 vector<string> groups = ct->getNamesOfGroups();
200 for (int i = 0; i < groups.size(); i++) {
202 if (groupCounts[i] != 0) {
203 //do you have a count for this group?
204 map<string, int>::iterator itGroup = tree[childPointer->second].groupCount.find(groups[i]);
206 //if yes, increment it - there should not be a case where we can't find it since we load group in read
207 if (itGroup != tree[childPointer->second].groupCount.end()) {
208 tree[childPointer->second].groupCount[groups[i]] += groupCounts[i];
213 thisCount = ct->getNumSeqs(seqName);
216 tree[childPointer->second].total += thisCount;
218 currentNode = childPointer->second;
222 tree.push_back(rawTaxNode(taxon));
223 int index = tree.size() - 1;
225 tree[index].parent = currentNode;
226 tree[index].level = (level+1);
227 tree[currentNode].children[taxon] = index;
230 //initialize groupcounts
231 if (groupmap != NULL) {
232 vector<string> mGroups = groupmap->getNamesOfGroups();
233 for (int j = 0; j < mGroups.size(); j++) {
234 tree[index].groupCount[mGroups[j]] = 0;
237 //find out the sequences group
238 string group = groupmap->getGroup(seqName);
240 if (group == "not found") { m->mothurOut("[WARNING]: " + seqName + " is not in your groupfile, and will be included in the overall total, but not any group total."); m->mothurOutEndLine(); }
242 //do you have a count for this group?
243 map<string, int>::iterator itGroup = tree[index].groupCount.find(group);
245 //if yes, increment it - there should not be a case where we can't find it since we load group in read
246 if (itGroup != tree[index].groupCount.end()) {
247 tree[index].groupCount[group]++;
249 }else if (ct != NULL) {
250 if (ct->hasGroupInfo()) {
251 vector<string> mGroups = ct->getNamesOfGroups();
252 for (int j = 0; j < mGroups.size(); j++) {
253 tree[index].groupCount[mGroups[j]] = 0;
255 vector<int> groupCounts = ct->getGroupCounts(seqName);
256 vector<string> groups = ct->getNamesOfGroups();
258 for (int i = 0; i < groups.size(); i++) {
259 if (groupCounts[i] != 0) {
261 //do you have a count for this group?
262 map<string, int>::iterator itGroup = tree[index].groupCount.find(groups[i]);
264 //if yes, increment it - there should not be a case where we can't find it since we load group in read
265 if (itGroup != tree[index].groupCount.end()) {
266 tree[index].groupCount[groups[i]]+=groupCounts[i];
271 thisCount = ct->getNumSeqs(seqName);
274 tree[index].total = thisCount;
277 }else{ //otherwise, error
278 m->mothurOut("Warning: cannot find taxon " + taxon + " in reference taxonomy tree at level " + toString(tree[currentNode].level) + " for " + seqName + ". This may cause totals of daughter levels not to add up in summary file."); m->mothurOutEndLine();
285 if ((seqTaxonomy == "") && (level < maxLevel)) { //if you think you are done and you are not.
286 for (int k = level; k < maxLevel; k++) { seqTaxonomy += "unclassified;"; }
291 catch(exception& e) {
292 m->errorOut(e, "PhyloSummary", "addSeqToTree");
296 /**************************************************************************************************/
298 int PhyloSummary::addSeqToTree(string seqTaxonomy, map<string, bool> containsGroup){
302 map<string, int>::iterator childPointer;
309 //are there confidence scores, if so remove them
310 if (seqTaxonomy.find_first_of('(') != -1) { m->removeConfidences(seqTaxonomy); }
312 while (seqTaxonomy != "") {
314 if (m->control_pressed) { return 0; }
316 //somehow the parent is getting one too many accnos
317 //use print to reassign the taxa id
318 taxon = getNextTaxon(seqTaxonomy);
320 childPointer = tree[currentNode].children.find(taxon);
322 if(childPointer != tree[currentNode].children.end()){ //if the node already exists, update count and move on
323 for (map<string, bool>::iterator itGroup = containsGroup.begin(); itGroup != containsGroup.end(); itGroup++) {
324 if (itGroup->second == true) {
325 tree[childPointer->second].groupCount[itGroup->first]++;
329 tree[childPointer->second].total++;
331 currentNode = childPointer->second;
335 tree.push_back(rawTaxNode(taxon));
336 int index = tree.size() - 1;
338 tree[index].parent = currentNode;
339 tree[index].level = (level+1);
340 tree[index].total = 1;
341 tree[currentNode].children[taxon] = index;
343 for (map<string, bool>::iterator itGroup = containsGroup.begin(); itGroup != containsGroup.end(); itGroup++) {
344 if (itGroup->second == true) {
345 tree[index].groupCount[itGroup->first]++;
351 }else{ //otherwise, error
352 m->mothurOut("Warning: cannot find taxon " + taxon + " in reference taxonomy tree at level " + toString(tree[currentNode].level) + ". This may cause totals of daughter levels not to add up in summary file."); m->mothurOutEndLine();
359 if ((seqTaxonomy == "") && (level < maxLevel)) { //if you think you are done and you are not.
360 for (int k = level; k < maxLevel; k++) { seqTaxonomy += "unclassified;"; }
365 catch(exception& e) {
366 m->errorOut(e, "PhyloSummary", "addSeqToTree");
371 /**************************************************************************************************/
373 void PhyloSummary::assignRank(int index){
375 map<string,int>::iterator it;
378 for(it=tree[index].children.begin();it!=tree[index].children.end();it++){
379 tree[it->second].rank = tree[index].rank + '.' + toString(counter);
382 assignRank(it->second);
385 catch(exception& e) {
386 m->errorOut(e, "PhyloSummary", "assignRank");
390 /**************************************************************************************************/
392 void PhyloSummary::print(ofstream& out){
395 if (ignore) { assignRank(0); }
396 vector<string> mGroups;
398 out << "taxlevel\t rankID\t taxon\t daughterlevels\t total\t";
399 if (groupmap != NULL) {
400 //so the labels match the counts below, since the map sorts them automatically...
401 //sort(groupmap->namesOfGroups.begin(), groupmap->namesOfGroups.end());
402 mGroups = groupmap->getNamesOfGroups();
403 for (int i = 0; i < mGroups.size(); i++) {
404 out << mGroups[i] << '\t';
406 }else if (ct != NULL) {
407 if (ct->hasGroupInfo()) {
408 mGroups = ct->getNamesOfGroups();
409 for (int i = 0; i < mGroups.size(); i++) {
410 out << mGroups[i] << '\t';
417 int totalChildrenInTree = 0;
418 map<string, int>::iterator itGroup;
420 map<string,int>::iterator it;
421 for(it=tree[0].children.begin();it!=tree[0].children.end();it++){
422 if (tree[it->second].total != 0) {
423 totalChildrenInTree++;
424 tree[0].total += tree[it->second].total;
426 if (groupmap != NULL) {
427 for (int i = 0; i < mGroups.size(); i++) { tree[0].groupCount[mGroups[i]] += tree[it->second].groupCount[mGroups[i]]; }
428 }else if ( ct != NULL) {
429 if (ct->hasGroupInfo()) { for (int i = 0; i < mGroups.size(); i++) { tree[0].groupCount[mGroups[i]] += tree[it->second].groupCount[mGroups[i]]; } }
435 out << tree[0].level << "\t" << tree[0].rank << "\t" << tree[0].name << "\t" << totalChildrenInTree << "\t" << tree[0].total << "\t";
438 if (groupmap != NULL) {
439 for (int i = 0; i < mGroups.size(); i++) { out << tree[0].groupCount[mGroups[i]] << '\t'; }
440 }else if ( ct != NULL) {
441 if (ct->hasGroupInfo()) { for (int i = 0; i < mGroups.size(); i++) { out << tree[0].groupCount[mGroups[i]] << '\t'; } }
449 catch(exception& e) {
450 m->errorOut(e, "PhyloSummary", "print");
455 /**************************************************************************************************/
457 void PhyloSummary::print(int i, ofstream& out){
459 map<string,int>::iterator it;
460 for(it=tree[i].children.begin();it!=tree[i].children.end();it++){
462 if (tree[it->second].total != 0) {
464 int totalChildrenInTree = 0;
466 map<string,int>::iterator it2;
467 for(it2=tree[it->second].children.begin();it2!=tree[it->second].children.end();it2++){
468 if (tree[it2->second].total != 0) { totalChildrenInTree++; }
471 out << tree[it->second].level << "\t" << tree[it->second].rank << "\t" << tree[it->second].name << "\t" << totalChildrenInTree << "\t" << tree[it->second].total << "\t";
473 map<string, int>::iterator itGroup;
474 if (groupmap != NULL) {
475 //for (itGroup = tree[it->second].groupCount.begin(); itGroup != tree[it->second].groupCount.end(); itGroup++) {
476 // out << itGroup->second << '\t';
478 vector<string> mGroups = groupmap->getNamesOfGroups();
479 for (int i = 0; i < mGroups.size(); i++) { out << tree[it->second].groupCount[mGroups[i]] << '\t'; }
480 }else if (ct != NULL) {
481 if (ct->hasGroupInfo()) {
482 vector<string> mGroups = ct->getNamesOfGroups();
483 for (int i = 0; i < mGroups.size(); i++) { out << tree[it->second].groupCount[mGroups[i]] << '\t'; }
490 print(it->second, out);
493 catch(exception& e) {
494 m->errorOut(e, "PhyloSummary", "print");
498 /**************************************************************************************************/
499 void PhyloSummary::readTreeStruct(ifstream& in){
503 string line = m->getline(in); m->gobble(in);
507 in >> num; m->gobble(in);
511 in >> maxLevel; m->gobble(in);
514 for (int i = 0; i < tree.size(); i++) {
516 in >> tree[i].level >> tree[i].name >> num; //num contains the number of children tree[i] has
521 for (int j = 0; j < num; j++) {
522 in >> childName >> childIndex;
523 tree[i].children[childName] = childIndex;
526 //initialize groupcounts
527 if (groupmap != NULL) {
528 for (int j = 0; j < (groupmap->getNamesOfGroups()).size(); j++) {
529 tree[i].groupCount[(groupmap->getNamesOfGroups())[j]] = 0;
531 }else if (ct != NULL) {
532 if (ct->hasGroupInfo()) {
533 for (int j = 0; j < (ct->getNamesOfGroups()).size(); j++) {
534 tree[i].groupCount[(ct->getNamesOfGroups())[j]] = 0;
543 //if (tree[i].level > maxLevel) { maxLevel = tree[i].level; }
547 catch(exception& e) {
548 m->errorOut(e, "PhyloSummary", "readTreeStruct");
552 /**************************************************************************************************/