2 * rawTrainingDataMaker.cpp
5 * Created by westcott on 4/21/10.
6 * Copyright 2010 Schloss Lab. All rights reserved.
10 #include "phylosummary.h"
12 /**************************************************************************************************/
14 PhyloSummary::PhyloSummary(string refTfile, string groupFile){
16 m = MothurOut::getInstance();
20 if (groupFile != "") {
21 groupmap = new GroupMap(groupFile);
27 //check for necessary files
28 string taxFileNameTest = refTfile.substr(0,refTfile.find_last_of(".")+1) + "tree.sum";
29 ifstream FileTest(taxFileNameTest.c_str());
32 m->mothurOut("Error: can't find " + taxFileNameTest + "."); m->mothurOutEndLine(); exit(1);
34 readTreeStruct(FileTest);
42 m->errorOut(e, "PhyloSummary", "PhyloSummary");
47 /**************************************************************************************************/
49 PhyloSummary::PhyloSummary(string groupFile){
51 m = MothurOut::getInstance();
55 if (groupFile != "") {
56 groupmap = new GroupMap(groupFile);
62 tree.push_back(rawTaxNode("Root"));
68 m->errorOut(e, "PhyloSummary", "PhyloSummary");
72 /**************************************************************************************************/
74 int PhyloSummary::summarize(string userTfile){
78 m->openInputFile(userTfile, in);
80 //read in users taxonomy file and add sequences to tree
84 in >> name >> tax; m->gobble(in);
86 addSeqToTree(name, tax);
89 if (m->control_pressed) { break; }
96 m->errorOut(e, "PhyloSummary", "summarize");
101 /**************************************************************************************************/
103 string PhyloSummary::getNextTaxon(string& heirarchy){
105 string currentLevel = "";
107 int pos = heirarchy.find_first_of(';');
108 currentLevel=heirarchy.substr(0,pos);
109 if (pos != (heirarchy.length()-1)) { heirarchy=heirarchy.substr(pos+1); }
110 else { heirarchy = ""; }
114 catch(exception& e) {
115 m->errorOut(e, "PhyloSummary", "getNextTaxon");
120 /**************************************************************************************************/
122 int PhyloSummary::addSeqToTree(string seqName, string seqTaxonomy){
126 map<string, int>::iterator childPointer;
133 //are there confidence scores, if so remove them
134 if (seqTaxonomy.find_first_of('(') != -1) { removeConfidences(seqTaxonomy); }
136 while (seqTaxonomy != "") {
138 if (m->control_pressed) { return 0; }
140 //somehow the parent is getting one too many accnos
141 //use print to reassign the taxa id
142 taxon = getNextTaxon(seqTaxonomy);
144 childPointer = tree[currentNode].children.find(taxon);
146 if(childPointer != tree[currentNode].children.end()){ //if the node already exists, update count and move on
147 if (groupmap != NULL) {
148 //find out the sequences group
149 string group = groupmap->getGroup(seqName);
151 if (group == "not found") { m->mothurOut("[WARNING]: " + seqName + " is not in your groupfile, and will be included in the overall total, but not any group total."); m->mothurOutEndLine(); }
153 //do you have a count for this group?
154 map<string, int>::iterator itGroup = tree[childPointer->second].groupCount.find(group);
156 //if yes, increment it - there should not be a case where we can't find it since we load group in read
157 if (itGroup != tree[childPointer->second].groupCount.end()) {
158 tree[childPointer->second].groupCount[group]++;
162 tree[childPointer->second].total++;
164 currentNode = childPointer->second;
168 tree.push_back(rawTaxNode(taxon));
169 int index = tree.size() - 1;
171 tree[index].parent = currentNode;
172 tree[index].level = (level+1);
173 tree[index].total = 1;
174 tree[currentNode].children[taxon] = index;
176 //initialize groupcounts
177 if (groupmap != NULL) {
178 vector<string> mGroups = groupmap->getNamesOfGroups();
179 for (int j = 0; j < mGroups.size(); j++) {
180 tree[index].groupCount[mGroups[j]] = 0;
183 //find out the sequences group
184 string group = groupmap->getGroup(seqName);
186 if (group == "not found") { m->mothurOut("[WARNING]: " + seqName + " is not in your groupfile, and will be included in the overall total, but not any group total."); m->mothurOutEndLine(); }
188 //do you have a count for this group?
189 map<string, int>::iterator itGroup = tree[index].groupCount.find(group);
191 //if yes, increment it - there should not be a case where we can't find it since we load group in read
192 if (itGroup != tree[index].groupCount.end()) {
193 tree[index].groupCount[group]++;
199 }else{ //otherwise, error
200 m->mothurOut("Warning: cannot find taxon " + taxon + " in reference taxonomy tree at level " + toString(tree[currentNode].level) + " for " + seqName + ". This may cause totals of daughter levels not to add up in summary file."); m->mothurOutEndLine();
207 if ((seqTaxonomy == "") && (level < maxLevel)) { //if you think you are done and you are not.
208 for (int k = level; k < maxLevel; k++) { seqTaxonomy += "unclassified;"; }
213 catch(exception& e) {
214 m->errorOut(e, "PhyloSummary", "addSeqToTree");
218 /**************************************************************************************************/
220 int PhyloSummary::addSeqToTree(string seqTaxonomy, vector<string> names){
224 map<string, int>::iterator childPointer;
231 //are there confidence scores, if so remove them
232 if (seqTaxonomy.find_first_of('(') != -1) { removeConfidences(seqTaxonomy); }
234 while (seqTaxonomy != "") {
236 if (m->control_pressed) { return 0; }
238 //somehow the parent is getting one too many accnos
239 //use print to reassign the taxa id
240 taxon = getNextTaxon(seqTaxonomy);
242 childPointer = tree[currentNode].children.find(taxon);
244 if(childPointer != tree[currentNode].children.end()){ //if the node already exists, update count and move on
245 if (groupmap != NULL) {
247 map<string, bool> containsGroup;
248 vector<string> mGroups = groupmap->getNamesOfGroups();
249 for (int j = 0; j < mGroups.size(); j++) {
250 containsGroup[mGroups[j]] = false;
253 for (int k = 0; k < names.size(); k++) {
254 //find out the sequences group
255 string group = groupmap->getGroup(names[k]);
257 if (group == "not found") { m->mothurOut("[WARNING]: " + names[k] + " is not in your groupfile, and will be included in the overall total, but not any group total."); m->mothurOutEndLine(); }
259 containsGroup[group] = true;
263 for (map<string, bool>::iterator itGroup = containsGroup.begin(); itGroup != containsGroup.end(); itGroup++) {
264 if (itGroup->second == true) {
265 tree[childPointer->second].groupCount[itGroup->first]++;
271 tree[childPointer->second].total++;
273 currentNode = childPointer->second;
277 tree.push_back(rawTaxNode(taxon));
278 int index = tree.size() - 1;
280 tree[index].parent = currentNode;
281 tree[index].level = (level+1);
282 tree[index].total = 1;
283 tree[currentNode].children[taxon] = index;
285 //initialize groupcounts
286 if (groupmap != NULL) {
287 map<string, bool> containsGroup;
288 vector<string> mGroups = groupmap->getNamesOfGroups();
289 for (int j = 0; j < mGroups.size(); j++) {
290 tree[index].groupCount[mGroups[j]] = 0;
291 containsGroup[mGroups[j]] = false;
295 for (int k = 0; k < names.size(); k++) {
296 //find out the sequences group
297 string group = groupmap->getGroup(names[k]);
299 if (group == "not found") { m->mothurOut("[WARNING]: " + names[k] + " is not in your groupfile, and will be included in the overall total, but not any group total."); m->mothurOutEndLine(); }
301 containsGroup[group] = true;
305 for (map<string, bool>::iterator itGroup = containsGroup.begin(); itGroup != containsGroup.end(); itGroup++) {
306 if (itGroup->second == true) {
307 tree[index].groupCount[itGroup->first]++;
314 }else{ //otherwise, error
315 m->mothurOut("Warning: cannot find taxon " + taxon + " in reference taxonomy tree at level " + toString(tree[currentNode].level) + ". This may cause totals of daughter levels not to add up in summary file."); m->mothurOutEndLine();
322 if ((seqTaxonomy == "") && (level < maxLevel)) { //if you think you are done and you are not.
323 for (int k = level; k < maxLevel; k++) { seqTaxonomy += "unclassified;"; }
328 catch(exception& e) {
329 m->errorOut(e, "PhyloSummary", "addSeqToTree");
334 /**************************************************************************************************/
336 void PhyloSummary::assignRank(int index){
338 map<string,int>::iterator it;
341 for(it=tree[index].children.begin();it!=tree[index].children.end();it++){
342 tree[it->second].rank = tree[index].rank + '.' + toString(counter);
345 assignRank(it->second);
348 catch(exception& e) {
349 m->errorOut(e, "PhyloSummary", "assignRank");
353 /**************************************************************************************************/
355 void PhyloSummary::print(ofstream& out){
358 if (ignore) { assignRank(0); }
361 out << "taxlevel\t rankID\t taxon\t daughterlevels\t total\t";
362 if (groupmap != NULL) {
363 //so the labels match the counts below, since the map sorts them automatically...
364 //sort(groupmap->namesOfGroups.begin(), groupmap->namesOfGroups.end());
365 vector<string> mGroups = groupmap->getNamesOfGroups();
366 for (int i = 0; i < mGroups.size(); i++) {
367 out << mGroups[i] << '\t';
373 int totalChildrenInTree = 0;
374 map<string, int>::iterator itGroup;
376 map<string,int>::iterator it;
377 for(it=tree[0].children.begin();it!=tree[0].children.end();it++){
378 if (tree[it->second].total != 0) {
379 totalChildrenInTree++;
380 tree[0].total += tree[it->second].total;
382 if (groupmap != NULL) {
383 vector<string> mGroups = groupmap->getNamesOfGroups();
384 for (int i = 0; i < mGroups.size(); i++) { tree[0].groupCount[mGroups[i]] += tree[it->second].groupCount[mGroups[i]]; }
390 out << tree[0].level << "\t" << tree[0].rank << "\t" << tree[0].name << "\t" << totalChildrenInTree << "\t" << tree[0].total << "\t";
393 if (groupmap != NULL) {
394 //for (itGroup = tree[0].groupCount.begin(); itGroup != tree[0].groupCount.end(); itGroup++) {
395 // out << itGroup->second << '\t';
397 vector<string> mGroups = groupmap->getNamesOfGroups();
398 for (int i = 0; i < mGroups.size(); i++) { out << tree[0].groupCount[mGroups[i]] << '\t'; }
406 catch(exception& e) {
407 m->errorOut(e, "PhyloSummary", "print");
412 /**************************************************************************************************/
414 void PhyloSummary::print(int i, ofstream& out){
416 map<string,int>::iterator it;
417 for(it=tree[i].children.begin();it!=tree[i].children.end();it++){
419 if (tree[it->second].total != 0) {
421 int totalChildrenInTree = 0;
423 map<string,int>::iterator it2;
424 for(it2=tree[it->second].children.begin();it2!=tree[it->second].children.end();it2++){
425 if (tree[it2->second].total != 0) { totalChildrenInTree++; }
428 out << tree[it->second].level << "\t" << tree[it->second].rank << "\t" << tree[it->second].name << "\t" << totalChildrenInTree << "\t" << tree[it->second].total << "\t";
430 map<string, int>::iterator itGroup;
431 if (groupmap != NULL) {
432 //for (itGroup = tree[it->second].groupCount.begin(); itGroup != tree[it->second].groupCount.end(); itGroup++) {
433 // out << itGroup->second << '\t';
435 vector<string> mGroups = groupmap->getNamesOfGroups();
436 for (int i = 0; i < mGroups.size(); i++) { out << tree[it->second].groupCount[mGroups[i]] << '\t'; }
442 print(it->second, out);
445 catch(exception& e) {
446 m->errorOut(e, "PhyloSummary", "print");
450 /**************************************************************************************************/
451 void PhyloSummary::readTreeStruct(ifstream& in){
455 string line = m->getline(in); m->gobble(in);
459 in >> num; m->gobble(in);
463 in >> maxLevel; m->gobble(in);
466 for (int i = 0; i < tree.size(); i++) {
468 in >> tree[i].level >> tree[i].name >> num; //num contains the number of children tree[i] has
473 for (int j = 0; j < num; j++) {
474 in >> childName >> childIndex;
475 tree[i].children[childName] = childIndex;
478 //initialize groupcounts
479 if (groupmap != NULL) {
480 for (int j = 0; j < (groupmap->getNamesOfGroups()).size(); j++) {
481 tree[i].groupCount[(groupmap->getNamesOfGroups())[j]] = 0;
489 //if (tree[i].level > maxLevel) { maxLevel = tree[i].level; }
493 catch(exception& e) {
494 m->errorOut(e, "PhyloSummary", "readTreeStruct");
498 /**************************************************************************************************/
499 void PhyloSummary::removeConfidences(string& tax) {
505 while (tax.find_first_of(';') != -1) {
507 taxon = tax.substr(0,tax.find_first_of(';'));
509 int pos = taxon.find_first_of('(');
511 taxon = taxon.substr(0, pos); //rip off confidence
516 tax = tax.substr(tax.find_first_of(';')+1, tax.length());
522 catch(exception& e) {
523 m->errorOut(e, "PhyloSummary", "removeConfidences");
527 /**************************************************************************************************/