2 * rawTrainingDataMaker.cpp
5 * Created by westcott on 4/21/10.
6 * Copyright 2010 Schloss Lab. All rights reserved.
10 #include "phylosummary.h"
12 /**************************************************************************************************/
14 PhyloSummary::PhyloSummary(string refTfile, string groupFile){
16 m = MothurOut::getInstance();
20 if (groupFile != "") {
21 groupmap = new GroupMap(groupFile);
27 //check for necessary files
28 string taxFileNameTest = refTfile.substr(0,refTfile.find_last_of(".")+1) + "tree.sum";
29 ifstream FileTest(taxFileNameTest.c_str());
32 m->mothurOut("Error: can't find " + taxFileNameTest + "."); m->mothurOutEndLine(); exit(1);
34 readTreeStruct(FileTest);
42 m->errorOut(e, "PhyloSummary", "PhyloSummary");
47 /**************************************************************************************************/
49 PhyloSummary::PhyloSummary(string groupFile){
51 m = MothurOut::getInstance();
55 if (groupFile != "") {
56 groupmap = new GroupMap(groupFile);
62 tree.push_back(rawTaxNode("Root"));
68 m->errorOut(e, "PhyloSummary", "PhyloSummary");
72 /**************************************************************************************************/
74 void PhyloSummary::summarize(string userTfile){
78 m->openInputFile(userTfile, in);
80 //read in users taxonomy file and add sequences to tree
83 in >> name >> tax; m->gobble(in);
85 addSeqToTree(name, tax);
87 if (m->control_pressed) { break; }
92 m->errorOut(e, "PhyloSummary", "summarize");
97 /**************************************************************************************************/
99 string PhyloSummary::getNextTaxon(string& heirarchy){
101 string currentLevel = "";
103 int pos = heirarchy.find_first_of(';');
104 currentLevel=heirarchy.substr(0,pos);
105 if (pos != (heirarchy.length()-1)) { heirarchy=heirarchy.substr(pos+1); }
106 else { heirarchy = ""; }
110 catch(exception& e) {
111 m->errorOut(e, "PhyloSummary", "getNextTaxon");
116 /**************************************************************************************************/
118 int PhyloSummary::addSeqToTree(string seqName, string seqTaxonomy){
122 map<string, int>::iterator childPointer;
129 while (seqTaxonomy != "") {
131 if (m->control_pressed) { return 0; }
133 //somehow the parent is getting one too many accnos
134 //use print to reassign the taxa id
135 taxon = getNextTaxon(seqTaxonomy);
137 childPointer = tree[currentNode].children.find(taxon);
139 if(childPointer != tree[currentNode].children.end()){ //if the node already exists, update count and move on
140 if (groupmap != NULL) {
141 //find out the sequences group
142 string group = groupmap->getGroup(seqName);
144 if (group == "not found") { m->mothurOut(seqName + " is not in your groupfile, and will be included in the overall total, but not any group total."); m->mothurOutEndLine(); }
146 //do you have a count for this group?
147 map<string, int>::iterator itGroup = tree[childPointer->second].groupCount.find(group);
149 //if yes, increment it - there should not be a case where we can't find it since we load group in read
150 if (itGroup != tree[childPointer->second].groupCount.end()) {
151 tree[childPointer->second].groupCount[group]++;
155 tree[childPointer->second].total++;
157 currentNode = childPointer->second;
161 tree.push_back(rawTaxNode(taxon));
162 int index = tree.size() - 1;
164 tree[index].parent = currentNode;
165 tree[index].level = (level+1);
166 tree[index].total = 1;
167 tree[currentNode].children[taxon] = index;
169 //initialize groupcounts
170 if (groupmap != NULL) {
171 vector<string> mGroups = groupmap->getNamesOfGroups();
172 for (int j = 0; j < mGroups.size(); j++) {
173 tree[index].groupCount[mGroups[j]] = 0;
176 //find out the sequences group
177 string group = groupmap->getGroup(seqName);
179 if (group == "not found") { m->mothurOut(seqName + " is not in your groupfile, and will be included in the overall total, but not any group total."); m->mothurOutEndLine(); }
181 //do you have a count for this group?
182 map<string, int>::iterator itGroup = tree[index].groupCount.find(group);
184 //if yes, increment it - there should not be a case where we can't find it since we load group in read
185 if (itGroup != tree[index].groupCount.end()) {
186 tree[index].groupCount[group]++;
192 }else{ //otherwise, error
193 m->mothurOut("Warning: cannot find taxon " + taxon + " in reference taxonomy tree at level " + toString(tree[currentNode].level) + " for " + seqName + ". This may cause totals of daughter levels not to add up in summary file."); m->mothurOutEndLine();
200 if ((seqTaxonomy == "") && (level < maxLevel)) { //if you think you are done and you are not.
201 for (int k = level; k < maxLevel; k++) { seqTaxonomy += "unclassified;"; }
206 catch(exception& e) {
207 m->errorOut(e, "PhyloSummary", "addSeqToTree");
211 /**************************************************************************************************/
213 int PhyloSummary::addSeqToTree(string seqTaxonomy, vector<string> names){
217 map<string, int>::iterator childPointer;
224 while (seqTaxonomy != "") {
226 if (m->control_pressed) { return 0; }
228 //somehow the parent is getting one too many accnos
229 //use print to reassign the taxa id
230 taxon = getNextTaxon(seqTaxonomy);
232 childPointer = tree[currentNode].children.find(taxon);
234 if(childPointer != tree[currentNode].children.end()){ //if the node already exists, update count and move on
235 if (groupmap != NULL) {
237 map<string, bool> containsGroup;
238 vector<string> mGroups = groupmap->getNamesOfGroups();
239 for (int j = 0; j < mGroups.size(); j++) {
240 containsGroup[mGroups[j]] = false;
243 for (int k = 0; k < names.size(); k++) {
244 //find out the sequences group
245 string group = groupmap->getGroup(names[k]);
247 if (group == "not found") { m->mothurOut(names[k] + " is not in your groupfile, and will be included in the overall total, but not any group total."); m->mothurOutEndLine(); }
249 containsGroup[group] = true;
253 for (map<string, bool>::iterator itGroup = containsGroup.begin(); itGroup != containsGroup.end(); itGroup++) {
254 if (itGroup->second == true) {
255 tree[childPointer->second].groupCount[itGroup->first]++;
261 tree[childPointer->second].total++;
263 currentNode = childPointer->second;
267 tree.push_back(rawTaxNode(taxon));
268 int index = tree.size() - 1;
270 tree[index].parent = currentNode;
271 tree[index].level = (level+1);
272 tree[index].total = 1;
273 tree[currentNode].children[taxon] = index;
275 //initialize groupcounts
276 if (groupmap != NULL) {
277 map<string, bool> containsGroup;
278 vector<string> mGroups = groupmap->getNamesOfGroups();
279 for (int j = 0; j < mGroups.size(); j++) {
280 tree[index].groupCount[mGroups[j]] = 0;
281 containsGroup[mGroups[j]] = false;
285 for (int k = 0; k < names.size(); k++) {
286 //find out the sequences group
287 string group = groupmap->getGroup(names[k]);
289 if (group == "not found") { m->mothurOut(names[k] + " is not in your groupfile, and will be included in the overall total, but not any group total."); m->mothurOutEndLine(); }
291 containsGroup[group] = true;
295 for (map<string, bool>::iterator itGroup = containsGroup.begin(); itGroup != containsGroup.end(); itGroup++) {
296 if (itGroup->second == true) {
297 tree[index].groupCount[itGroup->first]++;
304 }else{ //otherwise, error
305 m->mothurOut("Warning: cannot find taxon " + taxon + " in reference taxonomy tree at level " + toString(tree[currentNode].level) + ". This may cause totals of daughter levels not to add up in summary file."); m->mothurOutEndLine();
312 if ((seqTaxonomy == "") && (level < maxLevel)) { //if you think you are done and you are not.
313 for (int k = level; k < maxLevel; k++) { seqTaxonomy += "unclassified;"; }
318 catch(exception& e) {
319 m->errorOut(e, "PhyloSummary", "addSeqToTree");
324 /**************************************************************************************************/
326 void PhyloSummary::assignRank(int index){
328 map<string,int>::iterator it;
331 for(it=tree[index].children.begin();it!=tree[index].children.end();it++){
332 tree[it->second].rank = tree[index].rank + '.' + toString(counter);
335 assignRank(it->second);
338 catch(exception& e) {
339 m->errorOut(e, "PhyloSummary", "assignRank");
343 /**************************************************************************************************/
345 void PhyloSummary::print(ofstream& out){
348 if (ignore) { assignRank(0); }
351 out << "taxlevel\t rankID\t taxon\t daughterlevels\t total\t";
352 if (groupmap != NULL) {
353 //so the labels match the counts below, since the map sorts them automatically...
354 //sort(groupmap->namesOfGroups.begin(), groupmap->namesOfGroups.end());
355 vector<string> mGroups = groupmap->getNamesOfGroups();
356 for (int i = 0; i < mGroups.size(); i++) {
357 out << mGroups[i] << '\t';
363 int totalChildrenInTree = 0;
365 map<string,int>::iterator it;
366 for(it=tree[0].children.begin();it!=tree[0].children.end();it++){
367 if (tree[it->second].total != 0) { totalChildrenInTree++; }
371 out << tree[0].level << "\t" << tree[0].rank << "\t" << tree[0].name << "\t" << totalChildrenInTree << "\t" << tree[0].total << "\t";
373 map<string, int>::iterator itGroup;
374 if (groupmap != NULL) {
375 //for (itGroup = tree[0].groupCount.begin(); itGroup != tree[0].groupCount.end(); itGroup++) {
376 // out << itGroup->second << '\t';
378 vector<string> mGroups = groupmap->getNamesOfGroups();
379 for (int i = 0; i < mGroups.size(); i++) { out << tree[0].groupCount[mGroups[i]] << '\t'; }
387 catch(exception& e) {
388 m->errorOut(e, "PhyloSummary", "print");
393 /**************************************************************************************************/
395 void PhyloSummary::print(int i, ofstream& out){
397 map<string,int>::iterator it;
398 for(it=tree[i].children.begin();it!=tree[i].children.end();it++){
400 if (tree[it->second].total != 0) {
402 int totalChildrenInTree = 0;
404 map<string,int>::iterator it2;
405 for(it2=tree[it->second].children.begin();it2!=tree[it->second].children.end();it2++){
406 if (tree[it2->second].total != 0) { totalChildrenInTree++; }
409 out << tree[it->second].level << "\t" << tree[it->second].rank << "\t" << tree[it->second].name << "\t" << totalChildrenInTree << "\t" << tree[it->second].total << "\t";
411 map<string, int>::iterator itGroup;
412 if (groupmap != NULL) {
413 //for (itGroup = tree[it->second].groupCount.begin(); itGroup != tree[it->second].groupCount.end(); itGroup++) {
414 // out << itGroup->second << '\t';
416 vector<string> mGroups = groupmap->getNamesOfGroups();
417 for (int i = 0; i < mGroups.size(); i++) { out << tree[it->second].groupCount[mGroups[i]] << '\t'; }
423 print(it->second, out);
426 catch(exception& e) {
427 m->errorOut(e, "PhyloSummary", "print");
431 /**************************************************************************************************/
432 void PhyloSummary::readTreeStruct(ifstream& in){
436 string line = m->getline(in); m->gobble(in);
440 in >> num; m->gobble(in);
444 in >> maxLevel; m->gobble(in);
447 for (int i = 0; i < tree.size(); i++) {
449 in >> tree[i].level >> tree[i].name >> num; //num contains the number of children tree[i] has
454 for (int j = 0; j < num; j++) {
455 in >> childName >> childIndex;
456 tree[i].children[childName] = childIndex;
459 //initialize groupcounts
460 if (groupmap != NULL) {
461 for (int j = 0; j < (groupmap->getNamesOfGroups()).size(); j++) {
462 tree[i].groupCount[(groupmap->getNamesOfGroups())[j]] = 0;
470 //if (tree[i].level > maxLevel) { maxLevel = tree[i].level; }
474 catch(exception& e) {
475 m->errorOut(e, "PhyloSummary", "print");
480 /**************************************************************************************************/