2 * rawTrainingDataMaker.cpp
5 * Created by westcott on 4/21/10.
6 * Copyright 2010 Schloss Lab. All rights reserved.
10 #include "phylosummary.h"
12 /**************************************************************************************************/
14 PhyloSummary::PhyloSummary(string refTfile, string groupFile){
16 m = MothurOut::getInstance();
20 if (groupFile != "") {
21 groupmap = new GroupMap(groupFile);
27 //check for necessary files
28 string taxFileNameTest = m->getFullPathName((refTfile.substr(0,refTfile.find_last_of(".")+1) + "tree.sum"));
29 ifstream FileTest(taxFileNameTest.c_str());
32 m->mothurOut("Error: can't find " + taxFileNameTest + "."); m->mothurOutEndLine(); exit(1);
34 readTreeStruct(FileTest);
42 m->errorOut(e, "PhyloSummary", "PhyloSummary");
47 /**************************************************************************************************/
49 PhyloSummary::PhyloSummary(string groupFile){
51 m = MothurOut::getInstance();
55 if (groupFile != "") {
56 groupmap = new GroupMap(groupFile);
62 tree.push_back(rawTaxNode("Root"));
68 m->errorOut(e, "PhyloSummary", "PhyloSummary");
72 /**************************************************************************************************/
74 int PhyloSummary::summarize(string userTfile){
76 map<string, string> temp;
77 m->readTax(userTfile, temp);
79 for (map<string, string>::iterator itTemp = temp.begin(); itTemp != temp.end();) {
80 addSeqToTree(itTemp->first, itTemp->second);
88 m->errorOut(e, "PhyloSummary", "summarize");
93 /**************************************************************************************************/
95 string PhyloSummary::getNextTaxon(string& heirarchy){
97 string currentLevel = "";
99 int pos = heirarchy.find_first_of(';');
100 currentLevel=heirarchy.substr(0,pos);
101 if (pos != (heirarchy.length()-1)) { heirarchy=heirarchy.substr(pos+1); }
102 else { heirarchy = ""; }
106 catch(exception& e) {
107 m->errorOut(e, "PhyloSummary", "getNextTaxon");
112 /**************************************************************************************************/
114 int PhyloSummary::addSeqToTree(string seqName, string seqTaxonomy){
119 map<string, int>::iterator childPointer;
126 //are there confidence scores, if so remove them
127 if (seqTaxonomy.find_first_of('(') != -1) { m->removeConfidences(seqTaxonomy); }
129 while (seqTaxonomy != "") {
131 if (m->control_pressed) { return 0; }
133 //somehow the parent is getting one too many accnos
134 //use print to reassign the taxa id
135 taxon = getNextTaxon(seqTaxonomy);
137 childPointer = tree[currentNode].children.find(taxon);
139 if(childPointer != tree[currentNode].children.end()){ //if the node already exists, update count and move on
140 if (groupmap != NULL) {
141 //find out the sequences group
142 string group = groupmap->getGroup(seqName);
144 if (group == "not found") { m->mothurOut("[WARNING]: " + seqName + " is not in your groupfile, and will be included in the overall total, but not any group total."); m->mothurOutEndLine(); }
146 //do you have a count for this group?
147 map<string, int>::iterator itGroup = tree[childPointer->second].groupCount.find(group);
149 //if yes, increment it - there should not be a case where we can't find it since we load group in read
150 if (itGroup != tree[childPointer->second].groupCount.end()) {
151 tree[childPointer->second].groupCount[group]++;
155 tree[childPointer->second].total++;
157 currentNode = childPointer->second;
161 tree.push_back(rawTaxNode(taxon));
162 int index = tree.size() - 1;
164 tree[index].parent = currentNode;
165 tree[index].level = (level+1);
166 tree[index].total = 1;
167 tree[currentNode].children[taxon] = index;
169 //initialize groupcounts
170 if (groupmap != NULL) {
171 vector<string> mGroups = groupmap->getNamesOfGroups();
172 for (int j = 0; j < mGroups.size(); j++) {
173 tree[index].groupCount[mGroups[j]] = 0;
176 //find out the sequences group
177 string group = groupmap->getGroup(seqName);
179 if (group == "not found") { m->mothurOut("[WARNING]: " + seqName + " is not in your groupfile, and will be included in the overall total, but not any group total."); m->mothurOutEndLine(); }
181 //do you have a count for this group?
182 map<string, int>::iterator itGroup = tree[index].groupCount.find(group);
184 //if yes, increment it - there should not be a case where we can't find it since we load group in read
185 if (itGroup != tree[index].groupCount.end()) {
186 tree[index].groupCount[group]++;
192 }else{ //otherwise, error
193 m->mothurOut("Warning: cannot find taxon " + taxon + " in reference taxonomy tree at level " + toString(tree[currentNode].level) + " for " + seqName + ". This may cause totals of daughter levels not to add up in summary file."); m->mothurOutEndLine();
200 if ((seqTaxonomy == "") && (level < maxLevel)) { //if you think you are done and you are not.
201 for (int k = level; k < maxLevel; k++) { seqTaxonomy += "unclassified;"; }
206 catch(exception& e) {
207 m->errorOut(e, "PhyloSummary", "addSeqToTree");
211 /**************************************************************************************************/
213 int PhyloSummary::addSeqToTree(string seqTaxonomy, vector<string> names){
217 map<string, int>::iterator childPointer;
224 //are there confidence scores, if so remove them
225 if (seqTaxonomy.find_first_of('(') != -1) { m->removeConfidences(seqTaxonomy); }
227 while (seqTaxonomy != "") {
229 if (m->control_pressed) { return 0; }
231 //somehow the parent is getting one too many accnos
232 //use print to reassign the taxa id
233 taxon = getNextTaxon(seqTaxonomy);
235 childPointer = tree[currentNode].children.find(taxon);
237 if(childPointer != tree[currentNode].children.end()){ //if the node already exists, update count and move on
238 if (groupmap != NULL) {
240 map<string, bool> containsGroup;
241 vector<string> mGroups = groupmap->getNamesOfGroups();
242 for (int j = 0; j < mGroups.size(); j++) {
243 containsGroup[mGroups[j]] = false;
246 for (int k = 0; k < names.size(); k++) {
247 //find out the sequences group
248 string group = groupmap->getGroup(names[k]);
250 if (group == "not found") { m->mothurOut("[WARNING]: " + names[k] + " is not in your groupfile, and will be included in the overall total, but not any group total."); m->mothurOutEndLine(); }
252 containsGroup[group] = true;
256 for (map<string, bool>::iterator itGroup = containsGroup.begin(); itGroup != containsGroup.end(); itGroup++) {
257 if (itGroup->second == true) {
258 tree[childPointer->second].groupCount[itGroup->first]++;
264 tree[childPointer->second].total++;
266 currentNode = childPointer->second;
270 tree.push_back(rawTaxNode(taxon));
271 int index = tree.size() - 1;
273 tree[index].parent = currentNode;
274 tree[index].level = (level+1);
275 tree[index].total = 1;
276 tree[currentNode].children[taxon] = index;
278 //initialize groupcounts
279 if (groupmap != NULL) {
280 map<string, bool> containsGroup;
281 vector<string> mGroups = groupmap->getNamesOfGroups();
282 for (int j = 0; j < mGroups.size(); j++) {
283 tree[index].groupCount[mGroups[j]] = 0;
284 containsGroup[mGroups[j]] = false;
288 for (int k = 0; k < names.size(); k++) {
289 //find out the sequences group
290 string group = groupmap->getGroup(names[k]);
292 if (group == "not found") { m->mothurOut("[WARNING]: " + names[k] + " is not in your groupfile, and will be included in the overall total, but not any group total."); m->mothurOutEndLine(); }
294 containsGroup[group] = true;
298 for (map<string, bool>::iterator itGroup = containsGroup.begin(); itGroup != containsGroup.end(); itGroup++) {
299 if (itGroup->second == true) {
300 tree[index].groupCount[itGroup->first]++;
307 }else{ //otherwise, error
308 m->mothurOut("Warning: cannot find taxon " + taxon + " in reference taxonomy tree at level " + toString(tree[currentNode].level) + ". This may cause totals of daughter levels not to add up in summary file."); m->mothurOutEndLine();
315 if ((seqTaxonomy == "") && (level < maxLevel)) { //if you think you are done and you are not.
316 for (int k = level; k < maxLevel; k++) { seqTaxonomy += "unclassified;"; }
321 catch(exception& e) {
322 m->errorOut(e, "PhyloSummary", "addSeqToTree");
327 /**************************************************************************************************/
329 void PhyloSummary::assignRank(int index){
331 map<string,int>::iterator it;
334 for(it=tree[index].children.begin();it!=tree[index].children.end();it++){
335 tree[it->second].rank = tree[index].rank + '.' + toString(counter);
338 assignRank(it->second);
341 catch(exception& e) {
342 m->errorOut(e, "PhyloSummary", "assignRank");
346 /**************************************************************************************************/
348 void PhyloSummary::print(ofstream& out){
351 if (ignore) { assignRank(0); }
354 out << "taxlevel\t rankID\t taxon\t daughterlevels\t total\t";
355 if (groupmap != NULL) {
356 //so the labels match the counts below, since the map sorts them automatically...
357 //sort(groupmap->namesOfGroups.begin(), groupmap->namesOfGroups.end());
358 vector<string> mGroups = groupmap->getNamesOfGroups();
359 for (int i = 0; i < mGroups.size(); i++) {
360 out << mGroups[i] << '\t';
366 int totalChildrenInTree = 0;
367 map<string, int>::iterator itGroup;
369 map<string,int>::iterator it;
370 for(it=tree[0].children.begin();it!=tree[0].children.end();it++){
371 if (tree[it->second].total != 0) {
372 totalChildrenInTree++;
373 tree[0].total += tree[it->second].total;
375 if (groupmap != NULL) {
376 vector<string> mGroups = groupmap->getNamesOfGroups();
377 for (int i = 0; i < mGroups.size(); i++) { tree[0].groupCount[mGroups[i]] += tree[it->second].groupCount[mGroups[i]]; }
383 out << tree[0].level << "\t" << tree[0].rank << "\t" << tree[0].name << "\t" << totalChildrenInTree << "\t" << tree[0].total << "\t";
386 if (groupmap != NULL) {
387 //for (itGroup = tree[0].groupCount.begin(); itGroup != tree[0].groupCount.end(); itGroup++) {
388 // out << itGroup->second << '\t';
390 vector<string> mGroups = groupmap->getNamesOfGroups();
391 for (int i = 0; i < mGroups.size(); i++) { out << tree[0].groupCount[mGroups[i]] << '\t'; }
399 catch(exception& e) {
400 m->errorOut(e, "PhyloSummary", "print");
405 /**************************************************************************************************/
407 void PhyloSummary::print(int i, ofstream& out){
409 map<string,int>::iterator it;
410 for(it=tree[i].children.begin();it!=tree[i].children.end();it++){
412 if (tree[it->second].total != 0) {
414 int totalChildrenInTree = 0;
416 map<string,int>::iterator it2;
417 for(it2=tree[it->second].children.begin();it2!=tree[it->second].children.end();it2++){
418 if (tree[it2->second].total != 0) { totalChildrenInTree++; }
421 out << tree[it->second].level << "\t" << tree[it->second].rank << "\t" << tree[it->second].name << "\t" << totalChildrenInTree << "\t" << tree[it->second].total << "\t";
423 map<string, int>::iterator itGroup;
424 if (groupmap != NULL) {
425 //for (itGroup = tree[it->second].groupCount.begin(); itGroup != tree[it->second].groupCount.end(); itGroup++) {
426 // out << itGroup->second << '\t';
428 vector<string> mGroups = groupmap->getNamesOfGroups();
429 for (int i = 0; i < mGroups.size(); i++) { out << tree[it->second].groupCount[mGroups[i]] << '\t'; }
435 print(it->second, out);
438 catch(exception& e) {
439 m->errorOut(e, "PhyloSummary", "print");
443 /**************************************************************************************************/
444 void PhyloSummary::readTreeStruct(ifstream& in){
448 string line = m->getline(in); m->gobble(in);
452 in >> num; m->gobble(in);
456 in >> maxLevel; m->gobble(in);
459 for (int i = 0; i < tree.size(); i++) {
461 in >> tree[i].level >> tree[i].name >> num; //num contains the number of children tree[i] has
466 for (int j = 0; j < num; j++) {
467 in >> childName >> childIndex;
468 tree[i].children[childName] = childIndex;
471 //initialize groupcounts
472 if (groupmap != NULL) {
473 for (int j = 0; j < (groupmap->getNamesOfGroups()).size(); j++) {
474 tree[i].groupCount[(groupmap->getNamesOfGroups())[j]] = 0;
482 //if (tree[i].level > maxLevel) { maxLevel = tree[i].level; }
486 catch(exception& e) {
487 m->errorOut(e, "PhyloSummary", "readTreeStruct");
491 /**************************************************************************************************/