2 * rawTrainingDataMaker.cpp
5 * Created by westcott on 4/21/10.
6 * Copyright 2010 Schloss Lab. All rights reserved.
10 #include "phylosummary.h"
12 /**************************************************************************************************/
14 PhyloSummary::PhyloSummary(string refTfile, string groupFile){
16 m = MothurOut::getInstance();
20 if (groupFile != "") {
21 groupmap = new GroupMap(groupFile);
27 //check for necessary files
28 string taxFileNameTest = refTfile.substr(0,refTfile.find_last_of(".")+1) + "tree.sum";
29 ifstream FileTest(taxFileNameTest.c_str());
32 m->mothurOut("Error: can't find " + taxFileNameTest + "."); m->mothurOutEndLine(); exit(1);
34 readTreeStruct(FileTest);
42 m->errorOut(e, "PhyloSummary", "PhyloSummary");
47 /**************************************************************************************************/
49 PhyloSummary::PhyloSummary(string groupFile){
51 m = MothurOut::getInstance();
55 if (groupFile != "") {
56 groupmap = new GroupMap(groupFile);
62 tree.push_back(rawTaxNode("Root"));
68 m->errorOut(e, "PhyloSummary", "PhyloSummary");
72 /**************************************************************************************************/
74 void PhyloSummary::summarize(string userTfile){
78 m->openInputFile(userTfile, in);
80 //read in users taxonomy file and add sequences to tree
83 in >> name >> tax; m->gobble(in);
85 addSeqToTree(name, tax);
87 if (m->control_pressed) { break; }
92 m->errorOut(e, "PhyloSummary", "summarize");
97 /**************************************************************************************************/
99 string PhyloSummary::getNextTaxon(string& heirarchy){
101 string currentLevel = "";
103 int pos = heirarchy.find_first_of(';');
104 currentLevel=heirarchy.substr(0,pos);
105 if (pos != (heirarchy.length()-1)) { heirarchy=heirarchy.substr(pos+1); }
106 else { heirarchy = ""; }
110 catch(exception& e) {
111 m->errorOut(e, "PhyloSummary", "getNextTaxon");
116 /**************************************************************************************************/
118 int PhyloSummary::addSeqToTree(string seqName, string seqTaxonomy){
122 map<string, int>::iterator childPointer;
129 while (seqTaxonomy != "") {
131 if (m->control_pressed) { return 0; }
133 //somehow the parent is getting one too many accnos
134 //use print to reassign the taxa id
135 taxon = getNextTaxon(seqTaxonomy);
137 childPointer = tree[currentNode].children.find(taxon);
139 if(childPointer != tree[currentNode].children.end()){ //if the node already exists, update count and move on
140 if (groupmap != NULL) {
141 //find out the sequences group
142 string group = groupmap->getGroup(seqName);
144 if (group == "not found") { m->mothurOut(seqName + " is not in your groupfile, and will be included in the overall total, but not any group total."); m->mothurOutEndLine(); }
146 //do you have a count for this group?
147 map<string, int>::iterator itGroup = tree[childPointer->second].groupCount.find(group);
149 //if yes, increment it - there should not be a case where we can't find it since we load group in read
150 if (itGroup != tree[childPointer->second].groupCount.end()) {
151 tree[childPointer->second].groupCount[group]++;
155 tree[childPointer->second].total++;
157 currentNode = childPointer->second;
161 tree.push_back(rawTaxNode(taxon));
162 int index = tree.size() - 1;
164 tree[index].parent = currentNode;
165 tree[index].level = (level+1);
166 tree[index].total = 1;
167 tree[currentNode].children[taxon] = index;
169 //initialize groupcounts
170 if (groupmap != NULL) {
171 for (int j = 0; j < groupmap->namesOfGroups.size(); j++) {
172 tree[index].groupCount[groupmap->namesOfGroups[j]] = 0;
175 //find out the sequences group
176 string group = groupmap->getGroup(seqName);
178 if (group == "not found") { m->mothurOut(seqName + " is not in your groupfile, and will be included in the overall total, but not any group total."); m->mothurOutEndLine(); }
180 //do you have a count for this group?
181 map<string, int>::iterator itGroup = tree[index].groupCount.find(group);
183 //if yes, increment it - there should not be a case where we can't find it since we load group in read
184 if (itGroup != tree[index].groupCount.end()) {
185 tree[index].groupCount[group]++;
191 }else{ //otherwise, error
192 m->mothurOut("Warning: cannot find taxon " + taxon + " in reference taxonomy tree at level " + toString(tree[currentNode].level) + " for " + seqName + ". This may cause totals of daughter levels not to add up in summary file."); m->mothurOutEndLine();
199 if ((seqTaxonomy == "") && (level < maxLevel)) { //if you think you are done and you are not.
200 for (int k = level; k < maxLevel; k++) { seqTaxonomy += "unclassified;"; }
205 catch(exception& e) {
206 m->errorOut(e, "PhyloSummary", "addSeqToTree");
210 /**************************************************************************************************/
212 int PhyloSummary::addSeqToTree(string seqTaxonomy, vector<string> names){
216 map<string, int>::iterator childPointer;
223 while (seqTaxonomy != "") {
225 if (m->control_pressed) { return 0; }
227 //somehow the parent is getting one too many accnos
228 //use print to reassign the taxa id
229 taxon = getNextTaxon(seqTaxonomy);
231 childPointer = tree[currentNode].children.find(taxon);
233 if(childPointer != tree[currentNode].children.end()){ //if the node already exists, update count and move on
234 if (groupmap != NULL) {
236 map<string, bool> containsGroup;
237 for (int j = 0; j < groupmap->namesOfGroups.size(); j++) {
238 containsGroup[groupmap->namesOfGroups[j]] = false;
241 for (int k = 0; k < names.size(); k++) {
242 //find out the sequences group
243 string group = groupmap->getGroup(names[k]);
245 if (group == "not found") { m->mothurOut(names[k] + " is not in your groupfile, and will be included in the overall total, but not any group total."); m->mothurOutEndLine(); }
247 containsGroup[group] = true;
251 for (map<string, bool>::iterator itGroup = containsGroup.begin(); itGroup != containsGroup.end(); itGroup++) {
252 if (itGroup->second == true) {
253 tree[childPointer->second].groupCount[itGroup->first]++;
259 tree[childPointer->second].total++;
261 currentNode = childPointer->second;
265 tree.push_back(rawTaxNode(taxon));
266 int index = tree.size() - 1;
268 tree[index].parent = currentNode;
269 tree[index].level = (level+1);
270 tree[index].total = 1;
271 tree[currentNode].children[taxon] = index;
273 //initialize groupcounts
274 if (groupmap != NULL) {
275 map<string, bool> containsGroup;
276 for (int j = 0; j < groupmap->namesOfGroups.size(); j++) {
277 tree[index].groupCount[groupmap->namesOfGroups[j]] = 0;
278 containsGroup[groupmap->namesOfGroups[j]] = false;
282 for (int k = 0; k < names.size(); k++) {
283 //find out the sequences group
284 string group = groupmap->getGroup(names[k]);
286 if (group == "not found") { m->mothurOut(names[k] + " is not in your groupfile, and will be included in the overall total, but not any group total."); m->mothurOutEndLine(); }
288 containsGroup[group] = true;
292 for (map<string, bool>::iterator itGroup = containsGroup.begin(); itGroup != containsGroup.end(); itGroup++) {
293 if (itGroup->second == true) {
294 tree[index].groupCount[itGroup->first]++;
301 }else{ //otherwise, error
302 m->mothurOut("Warning: cannot find taxon " + taxon + " in reference taxonomy tree at level " + toString(tree[currentNode].level) + ". This may cause totals of daughter levels not to add up in summary file."); m->mothurOutEndLine();
309 if ((seqTaxonomy == "") && (level < maxLevel)) { //if you think you are done and you are not.
310 for (int k = level; k < maxLevel; k++) { seqTaxonomy += "unclassified;"; }
315 catch(exception& e) {
316 m->errorOut(e, "PhyloSummary", "addSeqToTree");
321 /**************************************************************************************************/
323 void PhyloSummary::assignRank(int index){
325 map<string,int>::iterator it;
328 for(it=tree[index].children.begin();it!=tree[index].children.end();it++){
329 tree[it->second].rank = tree[index].rank + '.' + toString(counter);
332 assignRank(it->second);
335 catch(exception& e) {
336 m->errorOut(e, "PhyloSummary", "assignRank");
340 /**************************************************************************************************/
342 void PhyloSummary::print(ofstream& out){
345 if (ignore) { assignRank(0); }
348 out << "taxlevel\t rankID\t taxon\t daughterlevels\t total\t";
349 if (groupmap != NULL) {
350 //so the labels match the counts below, since the map sorts them automatically...
351 //sort(groupmap->namesOfGroups.begin(), groupmap->namesOfGroups.end());
353 for (int i = 0; i < groupmap->namesOfGroups.size(); i++) {
354 out << groupmap->namesOfGroups[i] << '\t';
360 int totalChildrenInTree = 0;
362 map<string,int>::iterator it;
363 for(it=tree[0].children.begin();it!=tree[0].children.end();it++){
364 if (tree[it->second].total != 0) { totalChildrenInTree++; }
368 out << tree[0].level << "\t" << tree[0].rank << "\t" << tree[0].name << "\t" << totalChildrenInTree << "\t" << tree[0].total << "\t";
370 map<string, int>::iterator itGroup;
371 if (groupmap != NULL) {
372 //for (itGroup = tree[0].groupCount.begin(); itGroup != tree[0].groupCount.end(); itGroup++) {
373 // out << itGroup->second << '\t';
375 for (int i = 0; i < groupmap->namesOfGroups.size(); i++) { out << tree[0].groupCount[groupmap->namesOfGroups[i]] << '\t'; }
383 catch(exception& e) {
384 m->errorOut(e, "PhyloSummary", "print");
389 /**************************************************************************************************/
391 void PhyloSummary::print(int i, ofstream& out){
393 map<string,int>::iterator it;
394 for(it=tree[i].children.begin();it!=tree[i].children.end();it++){
396 if (tree[it->second].total != 0) {
398 int totalChildrenInTree = 0;
400 map<string,int>::iterator it2;
401 for(it2=tree[it->second].children.begin();it2!=tree[it->second].children.end();it2++){
402 if (tree[it2->second].total != 0) { totalChildrenInTree++; }
405 out << tree[it->second].level << "\t" << tree[it->second].rank << "\t" << tree[it->second].name << "\t" << totalChildrenInTree << "\t" << tree[it->second].total << "\t";
407 map<string, int>::iterator itGroup;
408 if (groupmap != NULL) {
409 //for (itGroup = tree[it->second].groupCount.begin(); itGroup != tree[it->second].groupCount.end(); itGroup++) {
410 // out << itGroup->second << '\t';
412 for (int i = 0; i < groupmap->namesOfGroups.size(); i++) { out << tree[it->second].groupCount[groupmap->namesOfGroups[i]] << '\t'; }
417 print(it->second, out);
420 catch(exception& e) {
421 m->errorOut(e, "PhyloSummary", "print");
425 /**************************************************************************************************/
426 void PhyloSummary::readTreeStruct(ifstream& in){
430 string line = m->getline(in); m->gobble(in);
434 in >> num; m->gobble(in);
438 in >> maxLevel; m->gobble(in);
441 for (int i = 0; i < tree.size(); i++) {
443 in >> tree[i].level >> tree[i].name >> num; //num contains the number of children tree[i] has
448 for (int j = 0; j < num; j++) {
449 in >> childName >> childIndex;
450 tree[i].children[childName] = childIndex;
453 //initialize groupcounts
454 if (groupmap != NULL) {
455 for (int j = 0; j < groupmap->namesOfGroups.size(); j++) {
456 tree[i].groupCount[groupmap->namesOfGroups[j]] = 0;
464 //if (tree[i].level > maxLevel) { maxLevel = tree[i].level; }
468 catch(exception& e) {
469 m->errorOut(e, "PhyloSummary", "print");
474 /**************************************************************************************************/