2 * rawTrainingDataMaker.cpp
5 * Created by westcott on 4/21/10.
6 * Copyright 2010 Schloss Lab. All rights reserved.
10 #include "phylosummary.h"
12 /**************************************************************************************************/
14 PhyloSummary::PhyloSummary(string refTfile, string groupFile){
16 m = MothurOut::getInstance();
20 if (groupFile != "") {
21 groupmap = new GroupMap(groupFile);
27 //check for necessary files
28 string taxFileNameTest = m->getFullPathName((refTfile.substr(0,refTfile.find_last_of(".")+1) + "tree.sum"));
29 ifstream FileTest(taxFileNameTest.c_str());
32 m->mothurOut("Error: can't find " + taxFileNameTest + "."); m->mothurOutEndLine(); exit(1);
34 readTreeStruct(FileTest);
42 m->errorOut(e, "PhyloSummary", "PhyloSummary");
47 /**************************************************************************************************/
49 PhyloSummary::PhyloSummary(string groupFile){
51 m = MothurOut::getInstance();
55 if (groupFile != "") {
56 groupmap = new GroupMap(groupFile);
62 tree.push_back(rawTaxNode("Root"));
68 m->errorOut(e, "PhyloSummary", "PhyloSummary");
72 /**************************************************************************************************/
74 int PhyloSummary::summarize(string userTfile){
78 m->openInputFile(userTfile, in);
80 //read in users taxonomy file and add sequences to tree
84 in >> name >> tax; m->gobble(in);
86 addSeqToTree(name, tax);
89 if (m->control_pressed) { break; }
96 m->errorOut(e, "PhyloSummary", "summarize");
101 /**************************************************************************************************/
103 string PhyloSummary::getNextTaxon(string& heirarchy){
105 string currentLevel = "";
107 int pos = heirarchy.find_first_of(';');
108 currentLevel=heirarchy.substr(0,pos);
109 if (pos != (heirarchy.length()-1)) { heirarchy=heirarchy.substr(pos+1); }
110 else { heirarchy = ""; }
114 catch(exception& e) {
115 m->errorOut(e, "PhyloSummary", "getNextTaxon");
120 /**************************************************************************************************/
122 int PhyloSummary::addSeqToTree(string seqName, string seqTaxonomy){
127 map<string, int>::iterator childPointer;
134 //are there confidence scores, if so remove them
135 if (seqTaxonomy.find_first_of('(') != -1) { m->removeConfidences(seqTaxonomy); }
137 while (seqTaxonomy != "") {
139 if (m->control_pressed) { return 0; }
141 //somehow the parent is getting one too many accnos
142 //use print to reassign the taxa id
143 taxon = getNextTaxon(seqTaxonomy);
145 childPointer = tree[currentNode].children.find(taxon);
147 if(childPointer != tree[currentNode].children.end()){ //if the node already exists, update count and move on
148 if (groupmap != NULL) {
149 //find out the sequences group
150 string group = groupmap->getGroup(seqName);
152 if (group == "not found") { m->mothurOut("[WARNING]: " + seqName + " is not in your groupfile, and will be included in the overall total, but not any group total."); m->mothurOutEndLine(); }
154 //do you have a count for this group?
155 map<string, int>::iterator itGroup = tree[childPointer->second].groupCount.find(group);
157 //if yes, increment it - there should not be a case where we can't find it since we load group in read
158 if (itGroup != tree[childPointer->second].groupCount.end()) {
159 tree[childPointer->second].groupCount[group]++;
163 tree[childPointer->second].total++;
165 currentNode = childPointer->second;
169 tree.push_back(rawTaxNode(taxon));
170 int index = tree.size() - 1;
172 tree[index].parent = currentNode;
173 tree[index].level = (level+1);
174 tree[index].total = 1;
175 tree[currentNode].children[taxon] = index;
177 //initialize groupcounts
178 if (groupmap != NULL) {
179 vector<string> mGroups = groupmap->getNamesOfGroups();
180 for (int j = 0; j < mGroups.size(); j++) {
181 tree[index].groupCount[mGroups[j]] = 0;
184 //find out the sequences group
185 string group = groupmap->getGroup(seqName);
187 if (group == "not found") { m->mothurOut("[WARNING]: " + seqName + " is not in your groupfile, and will be included in the overall total, but not any group total."); m->mothurOutEndLine(); }
189 //do you have a count for this group?
190 map<string, int>::iterator itGroup = tree[index].groupCount.find(group);
192 //if yes, increment it - there should not be a case where we can't find it since we load group in read
193 if (itGroup != tree[index].groupCount.end()) {
194 tree[index].groupCount[group]++;
200 }else{ //otherwise, error
201 m->mothurOut("Warning: cannot find taxon " + taxon + " in reference taxonomy tree at level " + toString(tree[currentNode].level) + " for " + seqName + ". This may cause totals of daughter levels not to add up in summary file."); m->mothurOutEndLine();
208 if ((seqTaxonomy == "") && (level < maxLevel)) { //if you think you are done and you are not.
209 for (int k = level; k < maxLevel; k++) { seqTaxonomy += "unclassified;"; }
214 catch(exception& e) {
215 m->errorOut(e, "PhyloSummary", "addSeqToTree");
219 /**************************************************************************************************/
221 int PhyloSummary::addSeqToTree(string seqTaxonomy, vector<string> names){
225 map<string, int>::iterator childPointer;
232 //are there confidence scores, if so remove them
233 if (seqTaxonomy.find_first_of('(') != -1) { m->removeConfidences(seqTaxonomy); }
235 while (seqTaxonomy != "") {
237 if (m->control_pressed) { return 0; }
239 //somehow the parent is getting one too many accnos
240 //use print to reassign the taxa id
241 taxon = getNextTaxon(seqTaxonomy);
243 childPointer = tree[currentNode].children.find(taxon);
245 if(childPointer != tree[currentNode].children.end()){ //if the node already exists, update count and move on
246 if (groupmap != NULL) {
248 map<string, bool> containsGroup;
249 vector<string> mGroups = groupmap->getNamesOfGroups();
250 for (int j = 0; j < mGroups.size(); j++) {
251 containsGroup[mGroups[j]] = false;
254 for (int k = 0; k < names.size(); k++) {
255 //find out the sequences group
256 string group = groupmap->getGroup(names[k]);
258 if (group == "not found") { m->mothurOut("[WARNING]: " + names[k] + " is not in your groupfile, and will be included in the overall total, but not any group total."); m->mothurOutEndLine(); }
260 containsGroup[group] = true;
264 for (map<string, bool>::iterator itGroup = containsGroup.begin(); itGroup != containsGroup.end(); itGroup++) {
265 if (itGroup->second == true) {
266 tree[childPointer->second].groupCount[itGroup->first]++;
272 tree[childPointer->second].total++;
274 currentNode = childPointer->second;
278 tree.push_back(rawTaxNode(taxon));
279 int index = tree.size() - 1;
281 tree[index].parent = currentNode;
282 tree[index].level = (level+1);
283 tree[index].total = 1;
284 tree[currentNode].children[taxon] = index;
286 //initialize groupcounts
287 if (groupmap != NULL) {
288 map<string, bool> containsGroup;
289 vector<string> mGroups = groupmap->getNamesOfGroups();
290 for (int j = 0; j < mGroups.size(); j++) {
291 tree[index].groupCount[mGroups[j]] = 0;
292 containsGroup[mGroups[j]] = false;
296 for (int k = 0; k < names.size(); k++) {
297 //find out the sequences group
298 string group = groupmap->getGroup(names[k]);
300 if (group == "not found") { m->mothurOut("[WARNING]: " + names[k] + " is not in your groupfile, and will be included in the overall total, but not any group total."); m->mothurOutEndLine(); }
302 containsGroup[group] = true;
306 for (map<string, bool>::iterator itGroup = containsGroup.begin(); itGroup != containsGroup.end(); itGroup++) {
307 if (itGroup->second == true) {
308 tree[index].groupCount[itGroup->first]++;
315 }else{ //otherwise, error
316 m->mothurOut("Warning: cannot find taxon " + taxon + " in reference taxonomy tree at level " + toString(tree[currentNode].level) + ". This may cause totals of daughter levels not to add up in summary file."); m->mothurOutEndLine();
323 if ((seqTaxonomy == "") && (level < maxLevel)) { //if you think you are done and you are not.
324 for (int k = level; k < maxLevel; k++) { seqTaxonomy += "unclassified;"; }
329 catch(exception& e) {
330 m->errorOut(e, "PhyloSummary", "addSeqToTree");
335 /**************************************************************************************************/
337 void PhyloSummary::assignRank(int index){
339 map<string,int>::iterator it;
342 for(it=tree[index].children.begin();it!=tree[index].children.end();it++){
343 tree[it->second].rank = tree[index].rank + '.' + toString(counter);
346 assignRank(it->second);
349 catch(exception& e) {
350 m->errorOut(e, "PhyloSummary", "assignRank");
354 /**************************************************************************************************/
356 void PhyloSummary::print(ofstream& out){
359 if (ignore) { assignRank(0); }
362 out << "taxlevel\t rankID\t taxon\t daughterlevels\t total\t";
363 if (groupmap != NULL) {
364 //so the labels match the counts below, since the map sorts them automatically...
365 //sort(groupmap->namesOfGroups.begin(), groupmap->namesOfGroups.end());
366 vector<string> mGroups = groupmap->getNamesOfGroups();
367 for (int i = 0; i < mGroups.size(); i++) {
368 out << mGroups[i] << '\t';
374 int totalChildrenInTree = 0;
375 map<string, int>::iterator itGroup;
377 map<string,int>::iterator it;
378 for(it=tree[0].children.begin();it!=tree[0].children.end();it++){
379 if (tree[it->second].total != 0) {
380 totalChildrenInTree++;
381 tree[0].total += tree[it->second].total;
383 if (groupmap != NULL) {
384 vector<string> mGroups = groupmap->getNamesOfGroups();
385 for (int i = 0; i < mGroups.size(); i++) { tree[0].groupCount[mGroups[i]] += tree[it->second].groupCount[mGroups[i]]; }
391 out << tree[0].level << "\t" << tree[0].rank << "\t" << tree[0].name << "\t" << totalChildrenInTree << "\t" << tree[0].total << "\t";
394 if (groupmap != NULL) {
395 //for (itGroup = tree[0].groupCount.begin(); itGroup != tree[0].groupCount.end(); itGroup++) {
396 // out << itGroup->second << '\t';
398 vector<string> mGroups = groupmap->getNamesOfGroups();
399 for (int i = 0; i < mGroups.size(); i++) { out << tree[0].groupCount[mGroups[i]] << '\t'; }
407 catch(exception& e) {
408 m->errorOut(e, "PhyloSummary", "print");
413 /**************************************************************************************************/
415 void PhyloSummary::print(int i, ofstream& out){
417 map<string,int>::iterator it;
418 for(it=tree[i].children.begin();it!=tree[i].children.end();it++){
420 if (tree[it->second].total != 0) {
422 int totalChildrenInTree = 0;
424 map<string,int>::iterator it2;
425 for(it2=tree[it->second].children.begin();it2!=tree[it->second].children.end();it2++){
426 if (tree[it2->second].total != 0) { totalChildrenInTree++; }
429 out << tree[it->second].level << "\t" << tree[it->second].rank << "\t" << tree[it->second].name << "\t" << totalChildrenInTree << "\t" << tree[it->second].total << "\t";
431 map<string, int>::iterator itGroup;
432 if (groupmap != NULL) {
433 //for (itGroup = tree[it->second].groupCount.begin(); itGroup != tree[it->second].groupCount.end(); itGroup++) {
434 // out << itGroup->second << '\t';
436 vector<string> mGroups = groupmap->getNamesOfGroups();
437 for (int i = 0; i < mGroups.size(); i++) { out << tree[it->second].groupCount[mGroups[i]] << '\t'; }
443 print(it->second, out);
446 catch(exception& e) {
447 m->errorOut(e, "PhyloSummary", "print");
451 /**************************************************************************************************/
452 void PhyloSummary::readTreeStruct(ifstream& in){
456 string line = m->getline(in); m->gobble(in);
460 in >> num; m->gobble(in);
464 in >> maxLevel; m->gobble(in);
467 for (int i = 0; i < tree.size(); i++) {
469 in >> tree[i].level >> tree[i].name >> num; //num contains the number of children tree[i] has
474 for (int j = 0; j < num; j++) {
475 in >> childName >> childIndex;
476 tree[i].children[childName] = childIndex;
479 //initialize groupcounts
480 if (groupmap != NULL) {
481 for (int j = 0; j < (groupmap->getNamesOfGroups()).size(); j++) {
482 tree[i].groupCount[(groupmap->getNamesOfGroups())[j]] = 0;
490 //if (tree[i].level > maxLevel) { maxLevel = tree[i].level; }
494 catch(exception& e) {
495 m->errorOut(e, "PhyloSummary", "readTreeStruct");
499 /**************************************************************************************************/