2 * rawTrainingDataMaker.cpp
5 * Created by westcott on 4/21/10.
6 * Copyright 2010 Schloss Lab. All rights reserved.
10 #include "phylosummary.h"
12 /**************************************************************************************************/
14 PhyloSummary::PhyloSummary(string refTfile, string groupFile){
16 m = MothurOut::getInstance();
19 if (groupFile != "") {
20 groupmap = new GroupMap(groupFile);
26 //check for necessary files
27 string taxFileNameTest = refTfile.substr(0,refTfile.find_last_of(".")+1) + "tree.sum";
28 ifstream FileTest(taxFileNameTest.c_str());
31 m->mothurOut("Error: can't find " + taxFileNameTest + "."); m->mothurOutEndLine(); exit(1);
33 readTreeStruct(FileTest);
41 m->errorOut(e, "PhyloSummary", "PhyloSummary");
45 /**************************************************************************************************/
47 void PhyloSummary::summarize(string userTfile){
51 m->openInputFile(userTfile, in);
53 //read in users taxonomy file and add sequences to tree
56 in >> name >> tax; m->gobble(in);
58 addSeqToTree(name, tax);
60 if (m->control_pressed) { break; }
65 m->errorOut(e, "PhyloSummary", "summarize");
70 /**************************************************************************************************/
72 string PhyloSummary::getNextTaxon(string& heirarchy){
74 string currentLevel = "";
76 int pos = heirarchy.find_first_of(';');
77 currentLevel=heirarchy.substr(0,pos);
78 if (pos != (heirarchy.length()-1)) { heirarchy=heirarchy.substr(pos+1); }
79 else { heirarchy = ""; }
84 m->errorOut(e, "PhyloSummary", "getNextTaxon");
89 /**************************************************************************************************/
91 int PhyloSummary::addSeqToTree(string seqName, string seqTaxonomy){
95 map<string, int>::iterator childPointer;
102 while (seqTaxonomy != "") {
104 if (m->control_pressed) { return 0; }
106 //somehow the parent is getting one too many accnos
107 //use print to reassign the taxa id
108 taxon = getNextTaxon(seqTaxonomy);
110 childPointer = tree[currentNode].children.find(taxon);
112 if(childPointer != tree[currentNode].children.end()){ //if the node already exists, update count and move on
113 if (groupmap != NULL) {
114 //find out the sequences group
115 string group = groupmap->getGroup(seqName);
117 if (group == "not found") { m->mothurOut(seqName + " is not in your groupfile, and will be included in the overall total, but not any group total."); m->mothurOutEndLine(); }
119 //do you have a count for this group?
120 map<string, int>::iterator itGroup = tree[currentNode].groupCount.find(group);
122 //if yes, increment it - there should not be a case where we can't find it since we load group in read
123 if (itGroup != tree[currentNode].groupCount.end()) {
124 tree[currentNode].groupCount[group]++;
128 tree[currentNode].total++;
130 currentNode = childPointer->second;
131 }else{ //otherwise, error
132 m->mothurOut("Warning: cannot find taxon " + taxon + " in reference taxonomy tree at level " + toString(tree[currentNode].level) + " for " + seqName + ". This may cause totals of daughter levels not to add up in summary file."); m->mothurOutEndLine();
138 if ((seqTaxonomy == "") && (level < maxLevel)) { //if you think you are done and you are not.
139 for (int k = level; k < maxLevel; k++) { seqTaxonomy += "unclassified;"; }
144 catch(exception& e) {
145 m->errorOut(e, "PhyloSummary", "addSeqToTree");
149 /**************************************************************************************************/
151 void PhyloSummary::assignRank(int index){
153 map<string,int>::iterator it;
156 for(it=tree[index].children.begin();it!=tree[index].children.end();it++){
157 tree[it->second].rank = tree[index].rank + '.' + toString(counter);
160 assignRank(it->second);
163 catch(exception& e) {
164 m->errorOut(e, "PhyloSummary", "assignRank");
168 /**************************************************************************************************/
170 void PhyloSummary::print(ofstream& out){
173 out << "taxlevel\t rankID\t taxon\t daughterlevels\t total\t";
174 if (groupmap != NULL) {
175 //so the labels match the counts below, since the map sorts them automatically...
176 //sort(groupmap->namesOfGroups.begin(), groupmap->namesOfGroups.end());
178 for (int i = 0; i < groupmap->namesOfGroups.size(); i++) {
179 out << groupmap->namesOfGroups[i] << '\t';
185 int totalChildrenInTree = 0;
187 map<string,int>::iterator it;
188 for(it=tree[0].children.begin();it!=tree[0].children.end();it++){
189 if (tree[it->second].total != 0) { totalChildrenInTree++; }
193 out << tree[0].level << "\t" << tree[0].rank << "\t" << tree[0].name << "\t" << totalChildrenInTree << "\t" << tree[0].total << "\t";
195 map<string, int>::iterator itGroup;
196 if (groupmap != NULL) {
197 //for (itGroup = tree[0].groupCount.begin(); itGroup != tree[0].groupCount.end(); itGroup++) {
198 // out << itGroup->second << '\t';
200 for (int i = 0; i < groupmap->namesOfGroups.size(); i++) { out << tree[0].groupCount[groupmap->namesOfGroups[i]] << '\t'; }
208 catch(exception& e) {
209 m->errorOut(e, "PhyloSummary", "print");
214 /**************************************************************************************************/
216 void PhyloSummary::print(int i, ofstream& out){
218 map<string,int>::iterator it;
219 for(it=tree[i].children.begin();it!=tree[i].children.end();it++){
221 if (tree[it->second].total != 0) {
223 int totalChildrenInTree = 0;
225 map<string,int>::iterator it2;
226 for(it2=tree[it->second].children.begin();it2!=tree[it->second].children.end();it2++){
227 if (tree[it2->second].total != 0) { totalChildrenInTree++; }
230 out << tree[it->second].level << "\t" << tree[it->second].rank << "\t" << tree[it->second].name << "\t" << totalChildrenInTree << "\t" << tree[it->second].total << "\t";
232 map<string, int>::iterator itGroup;
233 if (groupmap != NULL) {
234 //for (itGroup = tree[it->second].groupCount.begin(); itGroup != tree[it->second].groupCount.end(); itGroup++) {
235 // out << itGroup->second << '\t';
237 for (int i = 0; i < groupmap->namesOfGroups.size(); i++) { out << tree[it->second].groupCount[groupmap->namesOfGroups[i]] << '\t'; }
242 print(it->second, out);
245 catch(exception& e) {
246 m->errorOut(e, "PhyloSummary", "print");
250 /**************************************************************************************************/
251 void PhyloSummary::readTreeStruct(ifstream& in){
255 string line = m->getline(in); m->gobble(in);
259 in >> num; m->gobble(in);
263 in >> maxLevel; m->gobble(in);
266 for (int i = 0; i < tree.size(); i++) {
268 in >> tree[i].level >> tree[i].name >> num; //num contains the number of children tree[i] has
273 for (int j = 0; j < num; j++) {
274 in >> childName >> childIndex;
275 tree[i].children[childName] = childIndex;
278 //initialize groupcounts
279 if (groupmap != NULL) {
280 for (int j = 0; j < groupmap->namesOfGroups.size(); j++) {
281 tree[i].groupCount[groupmap->namesOfGroups[j]] = 0;
289 //if (tree[i].level > maxLevel) { maxLevel = tree[i].level; }
293 catch(exception& e) {
294 m->errorOut(e, "PhyloSummary", "print");
299 /**************************************************************************************************/