]> git.donarmstrong.com Git - mothur.git/blob - phylosummary.cpp
added summary.tax command and fixed bug with root level totals in tax.summary file
[mothur.git] / phylosummary.cpp
1 /*
2  *  rawTrainingDataMaker.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 4/21/10.
6  *  Copyright 2010 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "phylosummary.h"
11
12 /**************************************************************************************************/
13
14 PhyloSummary::PhyloSummary(string refTfile, string groupFile){
15         try {
16                 m = MothurOut::getInstance();
17                 maxLevel = 0;
18                 ignore = false;
19                 
20                 if (groupFile != "") {
21                         groupmap = new GroupMap(groupFile);
22                         groupmap->readMap();
23                 }else{
24                         groupmap = NULL;
25                 }
26                                 
27                 //check for necessary files
28                 string taxFileNameTest = refTfile.substr(0,refTfile.find_last_of(".")+1) + "tree.sum";
29                 ifstream FileTest(taxFileNameTest.c_str());
30                 
31                 if (!FileTest) { 
32                         m->mothurOut("Error: can't find " + taxFileNameTest + "."); m->mothurOutEndLine(); exit(1);
33                 }else{
34                         readTreeStruct(FileTest);
35                 }
36                 
37                 tree[0].rank = "0";
38                 assignRank(0);
39
40         }
41         catch(exception& e) {
42                 m->errorOut(e, "PhyloSummary", "PhyloSummary");
43                 exit(1);
44         }
45 }
46
47 /**************************************************************************************************/
48
49 PhyloSummary::PhyloSummary(string groupFile){
50         try {
51                 m = MothurOut::getInstance();
52                 maxLevel = 0;
53                 ignore = true;
54                 
55                 if (groupFile != "") {
56                         groupmap = new GroupMap(groupFile);
57                         groupmap->readMap();
58                 }else{
59                         groupmap = NULL;
60                 }
61                 
62                 tree.push_back(rawTaxNode("Root"));
63                 tree[0].rank = "0";
64                 
65                 
66         }
67         catch(exception& e) {
68                 m->errorOut(e, "PhyloSummary", "PhyloSummary");
69                 exit(1);
70         }
71 }
72 /**************************************************************************************************/
73
74 int PhyloSummary::summarize(string userTfile){
75         try {
76                 
77                 ifstream in;
78                 m->openInputFile(userTfile, in);
79                 
80                 //read in users taxonomy file and add sequences to tree
81                 string name, tax;
82                 int numSeqs = 0;
83                 while(!in.eof()){
84                         in >> name >> tax; m->gobble(in);
85                         
86                         addSeqToTree(name, tax);
87                         numSeqs++;
88                         
89                         if (m->control_pressed) { break;  }
90                 }
91                 in.close();
92                 
93                 return numSeqs;
94         }
95         catch(exception& e) {
96                 m->errorOut(e, "PhyloSummary", "summarize");
97                 exit(1);
98         }
99 }
100
101 /**************************************************************************************************/
102
103 string PhyloSummary::getNextTaxon(string& heirarchy){
104         try {
105                 string currentLevel = "";
106                 if(heirarchy != ""){
107                         int pos = heirarchy.find_first_of(';');
108                         currentLevel=heirarchy.substr(0,pos);
109                         if (pos != (heirarchy.length()-1)) {  heirarchy=heirarchy.substr(pos+1);  }
110                         else { heirarchy = ""; }
111                 }
112                 return currentLevel;
113         }
114         catch(exception& e) {
115                 m->errorOut(e, "PhyloSummary", "getNextTaxon");
116                 exit(1);
117         }
118 }
119
120 /**************************************************************************************************/
121
122 int PhyloSummary::addSeqToTree(string seqName, string seqTaxonomy){
123         try {
124                 numSeqs++;
125                 
126                 map<string, int>::iterator childPointer;
127                 
128                 int currentNode = 0;
129                 string taxon;
130                 
131                 int level = 0;
132                 
133                 //are there confidence scores, if so remove them
134                 if (seqTaxonomy.find_first_of('(') != -1) {  removeConfidences(seqTaxonomy);    }
135                 
136                 while (seqTaxonomy != "") {
137                         
138                         if (m->control_pressed) { return 0; }
139                         
140                         //somehow the parent is getting one too many accnos
141                         //use print to reassign the taxa id
142                         taxon = getNextTaxon(seqTaxonomy);
143                         
144                         childPointer = tree[currentNode].children.find(taxon);
145                         
146                         if(childPointer != tree[currentNode].children.end()){   //if the node already exists, update count and move on
147                                 if (groupmap != NULL) {
148                                         //find out the sequences group
149                                         string group = groupmap->getGroup(seqName);
150                                         
151                                         if (group == "not found") {  m->mothurOut(seqName + " is not in your groupfile, and will be included in the overall total, but not any group total."); m->mothurOutEndLine();  }
152                                         
153                                         //do you have a count for this group?
154                                         map<string, int>::iterator itGroup = tree[childPointer->second].groupCount.find(group);
155                                         
156                                         //if yes, increment it - there should not be a case where we can't find it since we load group in read
157                                         if (itGroup != tree[childPointer->second].groupCount.end()) {
158                                                 tree[childPointer->second].groupCount[group]++;
159                                         }
160                                 }
161                                 
162                                 tree[childPointer->second].total++;
163
164                                 currentNode = childPointer->second;
165                         }else{  
166                                 if (ignore) {
167                                                 
168                                         tree.push_back(rawTaxNode(taxon));
169                                         int index = tree.size() - 1;
170                                 
171                                         tree[index].parent = currentNode;
172                                         tree[index].level = (level+1);
173                                         tree[index].total = 1;
174                                         tree[currentNode].children[taxon] = index;
175                                         
176                                         //initialize groupcounts
177                                         if (groupmap != NULL) {
178                                                 vector<string> mGroups = groupmap->getNamesOfGroups();
179                                                 for (int j = 0; j < mGroups.size(); j++) {
180                                                         tree[index].groupCount[mGroups[j]] = 0;
181                                                 }
182                                                 
183                                                 //find out the sequences group
184                                                 string group = groupmap->getGroup(seqName);
185                                                 
186                                                 if (group == "not found") {  m->mothurOut(seqName + " is not in your groupfile, and will be included in the overall total, but not any group total."); m->mothurOutEndLine();  }
187                                                 
188                                                 //do you have a count for this group?
189                                                 map<string, int>::iterator itGroup = tree[index].groupCount.find(group);
190                                                 
191                                                 //if yes, increment it - there should not be a case where we can't find it since we load group in read
192                                                 if (itGroup != tree[index].groupCount.end()) {
193                                                         tree[index].groupCount[group]++;
194                                                 }                                               
195                                         }
196                                         
197                                         currentNode = index;
198                                         
199                                 }else{ //otherwise, error
200                                         m->mothurOut("Warning: cannot find taxon " + taxon + " in reference taxonomy tree at level " + toString(tree[currentNode].level) + " for " + seqName + ". This may cause totals of daughter levels not to add up in summary file."); m->mothurOutEndLine();
201                                         break;
202                                 }
203                         }
204                         
205                         level++;
206                         
207                         if ((seqTaxonomy == "") && (level < maxLevel)) {  //if you think you are done and you are not.
208                                 for (int k = level; k < maxLevel; k++) {  seqTaxonomy += "unclassified;";   }
209                         }
210                 }
211                 return 0;
212         }
213         catch(exception& e) {
214                 m->errorOut(e, "PhyloSummary", "addSeqToTree");
215                 exit(1);
216         }
217 }
218 /**************************************************************************************************/
219
220 int PhyloSummary::addSeqToTree(string seqTaxonomy, vector<string> names){
221         try {
222                 numSeqs++;
223                 
224                 map<string, int>::iterator childPointer;
225                 
226                 int currentNode = 0;
227                 string taxon;
228                 
229                 int level = 0;
230                 
231                 //are there confidence scores, if so remove them
232                 if (seqTaxonomy.find_first_of('(') != -1) {  removeConfidences(seqTaxonomy);    }
233                 
234                 while (seqTaxonomy != "") {
235                         
236                         if (m->control_pressed) { return 0; }
237                         
238                         //somehow the parent is getting one too many accnos
239                         //use print to reassign the taxa id
240                         taxon = getNextTaxon(seqTaxonomy);
241                         
242                         childPointer = tree[currentNode].children.find(taxon);
243                         
244                         if(childPointer != tree[currentNode].children.end()){   //if the node already exists, update count and move on
245                                 if (groupmap != NULL) {
246                                         
247                                         map<string, bool> containsGroup; 
248                                         vector<string> mGroups = groupmap->getNamesOfGroups();
249                                         for (int j = 0; j < mGroups.size(); j++) {
250                                                 containsGroup[mGroups[j]] = false;
251                                         }
252                                         
253                                         for (int k = 0; k < names.size(); k++) {
254                                                 //find out the sequences group
255                                                 string group = groupmap->getGroup(names[k]);
256                                         
257                                                 if (group == "not found") {  m->mothurOut(names[k] + " is not in your groupfile, and will be included in the overall total, but not any group total."); m->mothurOutEndLine();  }
258                                                 else {
259                                                         containsGroup[group] = true;
260                                                 }
261                                         }
262                                         
263                                         for (map<string, bool>::iterator itGroup = containsGroup.begin(); itGroup != containsGroup.end(); itGroup++) {
264                                                 if (itGroup->second == true) {
265                                                         tree[childPointer->second].groupCount[itGroup->first]++;
266                                                 }
267                                         }
268                                         
269                                 }
270                                 
271                                 tree[childPointer->second].total++;
272                                 
273                                 currentNode = childPointer->second;
274                         }else{  
275                                 if (ignore) {
276                                         
277                                         tree.push_back(rawTaxNode(taxon));
278                                         int index = tree.size() - 1;
279                                         
280                                         tree[index].parent = currentNode;
281                                         tree[index].level = (level+1);
282                                         tree[index].total = 1;
283                                         tree[currentNode].children[taxon] = index;
284                                         
285                                         //initialize groupcounts
286                                         if (groupmap != NULL) {
287                                                 map<string, bool> containsGroup; 
288                                                 vector<string> mGroups = groupmap->getNamesOfGroups();
289                                                 for (int j = 0; j < mGroups.size(); j++) {
290                                                         tree[index].groupCount[mGroups[j]] = 0;
291                                                         containsGroup[mGroups[j]] = false;
292                                                 }
293                                                 
294                                                 
295                                                 for (int k = 0; k < names.size(); k++) {
296                                                         //find out the sequences group
297                                                         string group = groupmap->getGroup(names[k]);
298                                                         
299                                                         if (group == "not found") {  m->mothurOut(names[k] + " is not in your groupfile, and will be included in the overall total, but not any group total."); m->mothurOutEndLine();  }
300                                                         else {
301                                                                 containsGroup[group] = true;
302                                                         }
303                                                 }
304                                                 
305                                                 for (map<string, bool>::iterator itGroup = containsGroup.begin(); itGroup != containsGroup.end(); itGroup++) {
306                                                         if (itGroup->second == true) {
307                                                                 tree[index].groupCount[itGroup->first]++;
308                                                         }
309                                                 }
310                                         }
311                                         
312                                         currentNode = index;
313                                         
314                                 }else{ //otherwise, error
315                                         m->mothurOut("Warning: cannot find taxon " + taxon + " in reference taxonomy tree at level " + toString(tree[currentNode].level) + ". This may cause totals of daughter levels not to add up in summary file."); m->mothurOutEndLine();
316                                         break;
317                                 }
318                         }
319                         
320                         level++;
321                         
322                         if ((seqTaxonomy == "") && (level < maxLevel)) {  //if you think you are done and you are not.
323                                 for (int k = level; k < maxLevel; k++) {  seqTaxonomy += "unclassified;";   }
324                         }
325                 }
326                 return 0;
327         }
328         catch(exception& e) {
329                 m->errorOut(e, "PhyloSummary", "addSeqToTree");
330                 exit(1);
331         }
332 }
333
334 /**************************************************************************************************/
335
336 void PhyloSummary::assignRank(int index){
337         try {
338                 map<string,int>::iterator it;
339                 int counter = 1;
340                 
341                 for(it=tree[index].children.begin();it!=tree[index].children.end();it++){
342                         tree[it->second].rank = tree[index].rank + '.' + toString(counter);
343                         counter++;
344                                                                         
345                         assignRank(it->second);
346                 }
347         }
348         catch(exception& e) {
349                 m->errorOut(e, "PhyloSummary", "assignRank");
350                 exit(1);
351         }
352 }
353 /**************************************************************************************************/
354
355 void PhyloSummary::print(ofstream& out){
356         try {
357                 
358                 if (ignore) { assignRank(0); }
359         
360                 //print labels
361                 out << "taxlevel\t rankID\t taxon\t daughterlevels\t total\t";
362                 if (groupmap != NULL) {
363                         //so the labels match the counts below, since the map sorts them automatically...
364                         //sort(groupmap->namesOfGroups.begin(), groupmap->namesOfGroups.end());
365                         vector<string> mGroups = groupmap->getNamesOfGroups();
366                         for (int i = 0; i < mGroups.size(); i++) {
367                                 out << mGroups[i] << '\t';
368                         }
369                 }
370                 
371                 out << endl;
372                 
373                 int totalChildrenInTree = 0;
374                 map<string, int>::iterator itGroup;
375                 
376                 map<string,int>::iterator it;
377                 for(it=tree[0].children.begin();it!=tree[0].children.end();it++){   
378                         if (tree[it->second].total != 0)  {   
379                                 totalChildrenInTree++; 
380                                 tree[0].total += tree[it->second].total;
381                                 
382                                 if (groupmap != NULL) {
383                                         vector<string> mGroups = groupmap->getNamesOfGroups();
384                                         for (int i = 0; i < mGroups.size(); i++) { tree[0].groupCount[mGroups[i]] += tree[it->second].groupCount[mGroups[i]]; } 
385                                 }
386                         }
387                 }
388                 
389                 //print root
390                 out << tree[0].level << "\t" << tree[0].rank << "\t" << tree[0].name << "\t" << totalChildrenInTree << "\t" << tree[0].total << "\t";
391                 
392                 
393                 if (groupmap != NULL) {
394                         //for (itGroup = tree[0].groupCount.begin(); itGroup != tree[0].groupCount.end(); itGroup++) {
395                         //      out << itGroup->second << '\t';
396                         //}
397                         vector<string> mGroups = groupmap->getNamesOfGroups();
398                         for (int i = 0; i < mGroups.size(); i++) {  out << tree[0].groupCount[mGroups[i]] << '\t'; } 
399                 }
400                 out << endl;
401                 
402                 //print rest
403                 print(0, out);
404                 
405         }
406         catch(exception& e) {
407                 m->errorOut(e, "PhyloSummary", "print");
408                 exit(1);
409         }
410 }
411
412 /**************************************************************************************************/
413
414 void PhyloSummary::print(int i, ofstream& out){
415         try {
416                 map<string,int>::iterator it;
417                 for(it=tree[i].children.begin();it!=tree[i].children.end();it++){
418                         
419                         if (tree[it->second].total != 0)  {
420                         
421                                 int totalChildrenInTree = 0;
422                 
423                                 map<string,int>::iterator it2;
424                                 for(it2=tree[it->second].children.begin();it2!=tree[it->second].children.end();it2++){   
425                                         if (tree[it2->second].total != 0)  {   totalChildrenInTree++; }
426                                 }
427                         
428                                 out << tree[it->second].level << "\t" << tree[it->second].rank << "\t" << tree[it->second].name << "\t" << totalChildrenInTree << "\t" << tree[it->second].total << "\t";
429                                 
430                                 map<string, int>::iterator itGroup;
431                                 if (groupmap != NULL) {
432                                         //for (itGroup = tree[it->second].groupCount.begin(); itGroup != tree[it->second].groupCount.end(); itGroup++) {
433                                         //      out << itGroup->second << '\t';
434                                         //}
435                                         vector<string> mGroups = groupmap->getNamesOfGroups();
436                                         for (int i = 0; i < mGroups.size(); i++) {  out << tree[it->second].groupCount[mGroups[i]] << '\t'; } 
437                                 }
438                                 out << endl;
439                                 
440                         }
441                         
442                         print(it->second, out);
443                 }
444         }
445         catch(exception& e) {
446                 m->errorOut(e, "PhyloSummary", "print");
447                 exit(1);
448         }
449 }
450 /**************************************************************************************************/
451 void PhyloSummary::readTreeStruct(ifstream& in){
452         try {
453         
454                 //read version
455                 string line = m->getline(in); m->gobble(in);
456                 
457                 int num;
458                 
459                 in >> num; m->gobble(in);
460                 
461                 tree.resize(num);
462                 
463                 in >> maxLevel; m->gobble(in);
464         
465                 //read the tree file
466                 for (int i = 0; i < tree.size(); i++) {
467         
468                         in >> tree[i].level >> tree[i].name >> num; //num contains the number of children tree[i] has
469                         
470                         //set children
471                         string childName;
472                         int childIndex;
473                         for (int j = 0; j < num; j++) {
474                                 in >> childName >> childIndex;
475                                 tree[i].children[childName] = childIndex;
476                         }
477                         
478                         //initialize groupcounts
479                         if (groupmap != NULL) {
480                                 for (int j = 0; j < (groupmap->getNamesOfGroups()).size(); j++) {
481                                         tree[i].groupCount[(groupmap->getNamesOfGroups())[j]] = 0;
482                                 }
483                         }
484                         
485                         tree[i].total = 0;
486                         
487                         m->gobble(in);
488                         
489                         //if (tree[i].level > maxLevel) {  maxLevel = tree[i].level;  }
490                 }
491
492         }
493         catch(exception& e) {
494                 m->errorOut(e, "PhyloSummary", "readTreeStruct");
495                 exit(1);
496         }
497 }
498 /**************************************************************************************************/
499 void PhyloSummary::removeConfidences(string& tax) {
500         try {
501                 
502                 string taxon;
503                 string newTax = "";
504                 
505                 while (tax.find_first_of(';') != -1) {
506                         //get taxon
507                         taxon = tax.substr(0,tax.find_first_of(';'));
508                         
509                         int pos = taxon.find_first_of('(');
510                         if (pos != -1) {
511                                 taxon = taxon.substr(0, pos); //rip off confidence 
512                         }
513                         
514                         taxon += ";";
515                         
516                         tax = tax.substr(tax.find_first_of(';')+1, tax.length());
517                         newTax += taxon;
518                 }
519                 
520                 tax = newTax;
521         }
522         catch(exception& e) {
523                 m->errorOut(e, "PhyloSummary", "removeConfidences");
524                 exit(1);
525         }
526 }
527 /**************************************************************************************************/
528
529
530