]> git.donarmstrong.com Git - mothur.git/blob - phylosummary.cpp
rewrote metastats command in c++, added mothurRemove function to handle ~ error....
[mothur.git] / phylosummary.cpp
1 /*
2  *  rawTrainingDataMaker.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 4/21/10.
6  *  Copyright 2010 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "phylosummary.h"
11
12 /**************************************************************************************************/
13
14 PhyloSummary::PhyloSummary(string refTfile, string groupFile){
15         try {
16                 m = MothurOut::getInstance();
17                 maxLevel = 0;
18                 ignore = false;
19                 
20                 if (groupFile != "") {
21                         groupmap = new GroupMap(groupFile);
22                         groupmap->readMap();
23                 }else{
24                         groupmap = NULL;
25                 }
26                                 
27                 //check for necessary files
28                 string taxFileNameTest = refTfile.substr(0,refTfile.find_last_of(".")+1) + "tree.sum";
29                 ifstream FileTest(taxFileNameTest.c_str());
30                 
31                 if (!FileTest) { 
32                         m->mothurOut("Error: can't find " + taxFileNameTest + "."); m->mothurOutEndLine(); exit(1);
33                 }else{
34                         readTreeStruct(FileTest);
35                 }
36                 
37                 tree[0].rank = "0";
38                 assignRank(0);
39
40         }
41         catch(exception& e) {
42                 m->errorOut(e, "PhyloSummary", "PhyloSummary");
43                 exit(1);
44         }
45 }
46
47 /**************************************************************************************************/
48
49 PhyloSummary::PhyloSummary(string groupFile){
50         try {
51                 m = MothurOut::getInstance();
52                 maxLevel = 0;
53                 ignore = true;
54                 
55                 if (groupFile != "") {
56                         groupmap = new GroupMap(groupFile);
57                         groupmap->readMap();
58                 }else{
59                         groupmap = NULL;
60                 }
61                 
62                 tree.push_back(rawTaxNode("Root"));
63                 tree[0].rank = "0";
64                 
65                 
66         }
67         catch(exception& e) {
68                 m->errorOut(e, "PhyloSummary", "PhyloSummary");
69                 exit(1);
70         }
71 }
72 /**************************************************************************************************/
73
74 void PhyloSummary::summarize(string userTfile){
75         try {
76                 
77                 ifstream in;
78                 m->openInputFile(userTfile, in);
79                 
80                 //read in users taxonomy file and add sequences to tree
81                 string name, tax;
82                 while(!in.eof()){
83                         in >> name >> tax; m->gobble(in);
84                         
85                         addSeqToTree(name, tax);
86                         
87                         if (m->control_pressed) { break;  }
88                 }
89                 in.close();
90         }
91         catch(exception& e) {
92                 m->errorOut(e, "PhyloSummary", "summarize");
93                 exit(1);
94         }
95 }
96
97 /**************************************************************************************************/
98
99 string PhyloSummary::getNextTaxon(string& heirarchy){
100         try {
101                 string currentLevel = "";
102                 if(heirarchy != ""){
103                         int pos = heirarchy.find_first_of(';');
104                         currentLevel=heirarchy.substr(0,pos);
105                         if (pos != (heirarchy.length()-1)) {  heirarchy=heirarchy.substr(pos+1);  }
106                         else { heirarchy = ""; }
107                 }
108                 return currentLevel;
109         }
110         catch(exception& e) {
111                 m->errorOut(e, "PhyloSummary", "getNextTaxon");
112                 exit(1);
113         }
114 }
115
116 /**************************************************************************************************/
117
118 int PhyloSummary::addSeqToTree(string seqName, string seqTaxonomy){
119         try {
120                 numSeqs++;
121                 
122                 map<string, int>::iterator childPointer;
123                 
124                 int currentNode = 0;
125                 string taxon;
126                 
127                 int level = 0;
128                 
129                 while (seqTaxonomy != "") {
130                         
131                         if (m->control_pressed) { return 0; }
132                         
133                         //somehow the parent is getting one too many accnos
134                         //use print to reassign the taxa id
135                         taxon = getNextTaxon(seqTaxonomy);
136                         
137                         childPointer = tree[currentNode].children.find(taxon);
138                         
139                         if(childPointer != tree[currentNode].children.end()){   //if the node already exists, update count and move on
140                                 if (groupmap != NULL) {
141                                         //find out the sequences group
142                                         string group = groupmap->getGroup(seqName);
143                                         
144                                         if (group == "not found") {  m->mothurOut(seqName + " is not in your groupfile, and will be included in the overall total, but not any group total."); m->mothurOutEndLine();  }
145                                         
146                                         //do you have a count for this group?
147                                         map<string, int>::iterator itGroup = tree[childPointer->second].groupCount.find(group);
148                                         
149                                         //if yes, increment it - there should not be a case where we can't find it since we load group in read
150                                         if (itGroup != tree[childPointer->second].groupCount.end()) {
151                                                 tree[childPointer->second].groupCount[group]++;
152                                         }
153                                 }
154                                 
155                                 tree[childPointer->second].total++;
156
157                                 currentNode = childPointer->second;
158                         }else{  
159                                 if (ignore) {
160                                                 
161                                         tree.push_back(rawTaxNode(taxon));
162                                         int index = tree.size() - 1;
163                                 
164                                         tree[index].parent = currentNode;
165                                         tree[index].level = (level+1);
166                                         tree[index].total = 1;
167                                         tree[currentNode].children[taxon] = index;
168                                         
169                                         //initialize groupcounts
170                                         if (groupmap != NULL) {
171                                                 for (int j = 0; j < groupmap->namesOfGroups.size(); j++) {
172                                                         tree[index].groupCount[groupmap->namesOfGroups[j]] = 0;
173                                                 }
174                                                 
175                                                 //find out the sequences group
176                                                 string group = groupmap->getGroup(seqName);
177                                                 
178                                                 if (group == "not found") {  m->mothurOut(seqName + " is not in your groupfile, and will be included in the overall total, but not any group total."); m->mothurOutEndLine();  }
179                                                 
180                                                 //do you have a count for this group?
181                                                 map<string, int>::iterator itGroup = tree[index].groupCount.find(group);
182                                                 
183                                                 //if yes, increment it - there should not be a case where we can't find it since we load group in read
184                                                 if (itGroup != tree[index].groupCount.end()) {
185                                                         tree[index].groupCount[group]++;
186                                                 }                                               
187                                         }
188                                         
189                                         currentNode = index;
190                                         
191                                 }else{ //otherwise, error
192                                         m->mothurOut("Warning: cannot find taxon " + taxon + " in reference taxonomy tree at level " + toString(tree[currentNode].level) + " for " + seqName + ". This may cause totals of daughter levels not to add up in summary file."); m->mothurOutEndLine();
193                                         break;
194                                 }
195                         }
196                         
197                         level++;
198                         
199                         if ((seqTaxonomy == "") && (level < maxLevel)) {  //if you think you are done and you are not.
200                                 for (int k = level; k < maxLevel; k++) {  seqTaxonomy += "unclassified;";   }
201                         }
202                 }
203                 return 0;
204         }
205         catch(exception& e) {
206                 m->errorOut(e, "PhyloSummary", "addSeqToTree");
207                 exit(1);
208         }
209 }
210 /**************************************************************************************************/
211
212 int PhyloSummary::addSeqToTree(string seqTaxonomy, vector<string> names){
213         try {
214                 numSeqs++;
215                 
216                 map<string, int>::iterator childPointer;
217                 
218                 int currentNode = 0;
219                 string taxon;
220                 
221                 int level = 0;
222                 
223                 while (seqTaxonomy != "") {
224                         
225                         if (m->control_pressed) { return 0; }
226                         
227                         //somehow the parent is getting one too many accnos
228                         //use print to reassign the taxa id
229                         taxon = getNextTaxon(seqTaxonomy);
230                         
231                         childPointer = tree[currentNode].children.find(taxon);
232                         
233                         if(childPointer != tree[currentNode].children.end()){   //if the node already exists, update count and move on
234                                 if (groupmap != NULL) {
235                                         
236                                         map<string, bool> containsGroup; 
237                                         for (int j = 0; j < groupmap->namesOfGroups.size(); j++) {
238                                                 containsGroup[groupmap->namesOfGroups[j]] = false;
239                                         }
240                                         
241                                         for (int k = 0; k < names.size(); k++) {
242                                                 //find out the sequences group
243                                                 string group = groupmap->getGroup(names[k]);
244                                         
245                                                 if (group == "not found") {  m->mothurOut(names[k] + " is not in your groupfile, and will be included in the overall total, but not any group total."); m->mothurOutEndLine();  }
246                                                 else {
247                                                         containsGroup[group] = true;
248                                                 }
249                                         }
250                                         
251                                         for (map<string, bool>::iterator itGroup = containsGroup.begin(); itGroup != containsGroup.end(); itGroup++) {
252                                                 if (itGroup->second == true) {
253                                                         tree[childPointer->second].groupCount[itGroup->first]++;
254                                                 }
255                                         }
256                                         
257                                 }
258                                 
259                                 tree[childPointer->second].total++;
260                                 
261                                 currentNode = childPointer->second;
262                         }else{  
263                                 if (ignore) {
264                                         
265                                         tree.push_back(rawTaxNode(taxon));
266                                         int index = tree.size() - 1;
267                                         
268                                         tree[index].parent = currentNode;
269                                         tree[index].level = (level+1);
270                                         tree[index].total = 1;
271                                         tree[currentNode].children[taxon] = index;
272                                         
273                                         //initialize groupcounts
274                                         if (groupmap != NULL) {
275                                                 map<string, bool> containsGroup; 
276                                                 for (int j = 0; j < groupmap->namesOfGroups.size(); j++) {
277                                                         tree[index].groupCount[groupmap->namesOfGroups[j]] = 0;
278                                                         containsGroup[groupmap->namesOfGroups[j]] = false;
279                                                 }
280                                                 
281                                                 
282                                                 for (int k = 0; k < names.size(); k++) {
283                                                         //find out the sequences group
284                                                         string group = groupmap->getGroup(names[k]);
285                                                         
286                                                         if (group == "not found") {  m->mothurOut(names[k] + " is not in your groupfile, and will be included in the overall total, but not any group total."); m->mothurOutEndLine();  }
287                                                         else {
288                                                                 containsGroup[group] = true;
289                                                         }
290                                                 }
291                                                 
292                                                 for (map<string, bool>::iterator itGroup = containsGroup.begin(); itGroup != containsGroup.end(); itGroup++) {
293                                                         if (itGroup->second == true) {
294                                                                 tree[index].groupCount[itGroup->first]++;
295                                                         }
296                                                 }
297                                         }
298                                         
299                                         currentNode = index;
300                                         
301                                 }else{ //otherwise, error
302                                         m->mothurOut("Warning: cannot find taxon " + taxon + " in reference taxonomy tree at level " + toString(tree[currentNode].level) + ". This may cause totals of daughter levels not to add up in summary file."); m->mothurOutEndLine();
303                                         break;
304                                 }
305                         }
306                         
307                         level++;
308                         
309                         if ((seqTaxonomy == "") && (level < maxLevel)) {  //if you think you are done and you are not.
310                                 for (int k = level; k < maxLevel; k++) {  seqTaxonomy += "unclassified;";   }
311                         }
312                 }
313                 return 0;
314         }
315         catch(exception& e) {
316                 m->errorOut(e, "PhyloSummary", "addSeqToTree");
317                 exit(1);
318         }
319 }
320
321 /**************************************************************************************************/
322
323 void PhyloSummary::assignRank(int index){
324         try {
325                 map<string,int>::iterator it;
326                 int counter = 1;
327                 
328                 for(it=tree[index].children.begin();it!=tree[index].children.end();it++){
329                         tree[it->second].rank = tree[index].rank + '.' + toString(counter);
330                         counter++;
331                                                                         
332                         assignRank(it->second);
333                 }
334         }
335         catch(exception& e) {
336                 m->errorOut(e, "PhyloSummary", "assignRank");
337                 exit(1);
338         }
339 }
340 /**************************************************************************************************/
341
342 void PhyloSummary::print(ofstream& out){
343         try {
344                 
345                 if (ignore) { assignRank(0); }
346         
347                 //print labels
348                 out << "taxlevel\t rankID\t taxon\t daughterlevels\t total\t";
349                 if (groupmap != NULL) {
350                         //so the labels match the counts below, since the map sorts them automatically...
351                         //sort(groupmap->namesOfGroups.begin(), groupmap->namesOfGroups.end());
352                         
353                         for (int i = 0; i < groupmap->namesOfGroups.size(); i++) {
354                                 out << groupmap->namesOfGroups[i] << '\t';
355                         }
356                 }
357                 
358                 out << endl;
359                 
360                 int totalChildrenInTree = 0;
361                 
362                 map<string,int>::iterator it;
363                 for(it=tree[0].children.begin();it!=tree[0].children.end();it++){   
364                         if (tree[it->second].total != 0)  {   totalChildrenInTree++; }
365                 }
366                 
367                 //print root
368                 out << tree[0].level << "\t" << tree[0].rank << "\t" << tree[0].name << "\t" << totalChildrenInTree << "\t" << tree[0].total << "\t";
369                 
370                 map<string, int>::iterator itGroup;
371                 if (groupmap != NULL) {
372                         //for (itGroup = tree[0].groupCount.begin(); itGroup != tree[0].groupCount.end(); itGroup++) {
373                         //      out << itGroup->second << '\t';
374                         //}
375                         for (int i = 0; i < groupmap->namesOfGroups.size(); i++) {  out << tree[0].groupCount[groupmap->namesOfGroups[i]] << '\t'; } 
376                 }
377                 out << endl;
378                 
379                 //print rest
380                 print(0, out);
381                 
382         }
383         catch(exception& e) {
384                 m->errorOut(e, "PhyloSummary", "print");
385                 exit(1);
386         }
387 }
388
389 /**************************************************************************************************/
390
391 void PhyloSummary::print(int i, ofstream& out){
392         try {
393                 map<string,int>::iterator it;
394                 for(it=tree[i].children.begin();it!=tree[i].children.end();it++){
395                         
396                         if (tree[it->second].total != 0)  {
397                         
398                                 int totalChildrenInTree = 0;
399                 
400                                 map<string,int>::iterator it2;
401                                 for(it2=tree[it->second].children.begin();it2!=tree[it->second].children.end();it2++){   
402                                         if (tree[it2->second].total != 0)  {   totalChildrenInTree++; }
403                                 }
404                         
405                                 out << tree[it->second].level << "\t" << tree[it->second].rank << "\t" << tree[it->second].name << "\t" << totalChildrenInTree << "\t" << tree[it->second].total << "\t";
406                                 
407                                 map<string, int>::iterator itGroup;
408                                 if (groupmap != NULL) {
409                                         //for (itGroup = tree[it->second].groupCount.begin(); itGroup != tree[it->second].groupCount.end(); itGroup++) {
410                                         //      out << itGroup->second << '\t';
411                                         //}
412                                         for (int i = 0; i < groupmap->namesOfGroups.size(); i++) {  out << tree[it->second].groupCount[groupmap->namesOfGroups[i]] << '\t'; } 
413                                 }
414                                 out << endl;
415                                 
416                         }
417                         
418                         print(it->second, out);
419                 }
420         }
421         catch(exception& e) {
422                 m->errorOut(e, "PhyloSummary", "print");
423                 exit(1);
424         }
425 }
426 /**************************************************************************************************/
427 void PhyloSummary::readTreeStruct(ifstream& in){
428         try {
429         
430                 //read version
431                 string line = m->getline(in); m->gobble(in);
432                 
433                 int num;
434                 
435                 in >> num; m->gobble(in);
436                 
437                 tree.resize(num);
438                 
439                 in >> maxLevel; m->gobble(in);
440         
441                 //read the tree file
442                 for (int i = 0; i < tree.size(); i++) {
443         
444                         in >> tree[i].level >> tree[i].name >> num; //num contains the number of children tree[i] has
445                         
446                         //set children
447                         string childName;
448                         int childIndex;
449                         for (int j = 0; j < num; j++) {
450                                 in >> childName >> childIndex;
451                                 tree[i].children[childName] = childIndex;
452                         }
453                         
454                         //initialize groupcounts
455                         if (groupmap != NULL) {
456                                 for (int j = 0; j < groupmap->namesOfGroups.size(); j++) {
457                                         tree[i].groupCount[groupmap->namesOfGroups[j]] = 0;
458                                 }
459                         }
460                         
461                         tree[i].total = 0;
462                         
463                         m->gobble(in);
464                         
465                         //if (tree[i].level > maxLevel) {  maxLevel = tree[i].level;  }
466                 }
467
468         }
469         catch(exception& e) {
470                 m->errorOut(e, "PhyloSummary", "print");
471                 exit(1);
472         }
473 }
474
475 /**************************************************************************************************/
476
477
478