]> git.donarmstrong.com Git - mothur.git/blob - phylosummary.cpp
added load.logfile command. changed summary.single output for subsample=t.
[mothur.git] / phylosummary.cpp
1 /*
2  *  rawTrainingDataMaker.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 4/21/10.
6  *  Copyright 2010 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "phylosummary.h"
11
12 /**************************************************************************************************/
13
14 PhyloSummary::PhyloSummary(string refTfile, string groupFile){
15         try {
16                 m = MothurOut::getInstance();
17                 maxLevel = 0;
18                 ignore = false;
19                 
20                 if (groupFile != "") {
21                         groupmap = new GroupMap(groupFile);
22                         groupmap->readMap();
23                 }else{
24                         groupmap = NULL;
25                 }
26                                 
27                 //check for necessary files
28                 string taxFileNameTest = m->getFullPathName((refTfile.substr(0,refTfile.find_last_of(".")+1) + "tree.sum"));
29                 ifstream FileTest(taxFileNameTest.c_str());
30                 
31                 if (!FileTest) { 
32                         m->mothurOut("Error: can't find " + taxFileNameTest + "."); m->mothurOutEndLine(); exit(1);
33                 }else{
34                         readTreeStruct(FileTest);
35                 }
36                 
37                 tree[0].rank = "0";
38                 assignRank(0);
39
40         }
41         catch(exception& e) {
42                 m->errorOut(e, "PhyloSummary", "PhyloSummary");
43                 exit(1);
44         }
45 }
46
47 /**************************************************************************************************/
48
49 PhyloSummary::PhyloSummary(string groupFile){
50         try {
51                 m = MothurOut::getInstance();
52                 maxLevel = 0;
53                 ignore = true;
54                 
55                 if (groupFile != "") {
56                         groupmap = new GroupMap(groupFile);
57                         groupmap->readMap();
58                 }else{
59                         groupmap = NULL;
60                 }
61                 
62                 tree.push_back(rawTaxNode("Root"));
63                 tree[0].rank = "0";
64                 
65                 
66         }
67         catch(exception& e) {
68                 m->errorOut(e, "PhyloSummary", "PhyloSummary");
69                 exit(1);
70         }
71 }
72 /**************************************************************************************************/
73
74 int PhyloSummary::summarize(string userTfile){
75         try {
76                 map<string, string> temp;
77         m->readTax(userTfile, temp);
78         
79         for (map<string, string>::iterator itTemp = temp.begin(); itTemp != temp.end();) {
80             addSeqToTree(itTemp->first, itTemp->second);
81                         numSeqs++;
82             temp.erase(itTemp++);
83         }
84         
85         return numSeqs;
86         }
87         catch(exception& e) {
88                 m->errorOut(e, "PhyloSummary", "summarize");
89                 exit(1);
90         }
91 }
92
93 /**************************************************************************************************/
94
95 string PhyloSummary::getNextTaxon(string& heirarchy){
96         try {
97                 string currentLevel = "";
98                 if(heirarchy != ""){
99                         int pos = heirarchy.find_first_of(';');
100                         currentLevel=heirarchy.substr(0,pos);
101                         if (pos != (heirarchy.length()-1)) {  heirarchy=heirarchy.substr(pos+1);  }
102                         else { heirarchy = ""; }
103                 }
104                 return currentLevel;
105         }
106         catch(exception& e) {
107                 m->errorOut(e, "PhyloSummary", "getNextTaxon");
108                 exit(1);
109         }
110 }
111
112 /**************************************************************************************************/
113
114 int PhyloSummary::addSeqToTree(string seqName, string seqTaxonomy){
115         try {
116                                 
117                 numSeqs++;
118                 
119                 map<string, int>::iterator childPointer;
120                 
121                 int currentNode = 0;
122                 string taxon;
123                 
124                 int level = 0;
125                 
126                 //are there confidence scores, if so remove them
127                 if (seqTaxonomy.find_first_of('(') != -1) {  m->removeConfidences(seqTaxonomy); }
128                 
129                 while (seqTaxonomy != "") {
130                         
131                         if (m->control_pressed) { return 0; }
132                         
133                         //somehow the parent is getting one too many accnos
134                         //use print to reassign the taxa id
135                         taxon = getNextTaxon(seqTaxonomy);
136                         
137                         childPointer = tree[currentNode].children.find(taxon);
138                         
139                         if(childPointer != tree[currentNode].children.end()){   //if the node already exists, update count and move on
140                                 if (groupmap != NULL) {
141                                         //find out the sequences group
142                                         string group = groupmap->getGroup(seqName);
143                                         
144                                         if (group == "not found") {  m->mothurOut("[WARNING]: " + seqName + " is not in your groupfile, and will be included in the overall total, but not any group total."); m->mothurOutEndLine();  }
145                                         
146                                         //do you have a count for this group?
147                                         map<string, int>::iterator itGroup = tree[childPointer->second].groupCount.find(group);
148                                         
149                                         //if yes, increment it - there should not be a case where we can't find it since we load group in read
150                                         if (itGroup != tree[childPointer->second].groupCount.end()) {
151                                                 tree[childPointer->second].groupCount[group]++;
152                                         }
153                                 }
154                                 
155                                 tree[childPointer->second].total++;
156
157                                 currentNode = childPointer->second;
158                         }else{  
159                                 if (ignore) {
160                                                 
161                                         tree.push_back(rawTaxNode(taxon));
162                                         int index = tree.size() - 1;
163                                 
164                                         tree[index].parent = currentNode;
165                                         tree[index].level = (level+1);
166                                         tree[index].total = 1;
167                                         tree[currentNode].children[taxon] = index;
168                                         
169                                         //initialize groupcounts
170                                         if (groupmap != NULL) {
171                                                 vector<string> mGroups = groupmap->getNamesOfGroups();
172                                                 for (int j = 0; j < mGroups.size(); j++) {
173                                                         tree[index].groupCount[mGroups[j]] = 0;
174                                                 }
175                                                 
176                                                 //find out the sequences group
177                                                 string group = groupmap->getGroup(seqName);
178                                                 
179                                                 if (group == "not found") {  m->mothurOut("[WARNING]: " + seqName + " is not in your groupfile, and will be included in the overall total, but not any group total."); m->mothurOutEndLine();  }
180                                                 
181                                                 //do you have a count for this group?
182                                                 map<string, int>::iterator itGroup = tree[index].groupCount.find(group);
183                                                 
184                                                 //if yes, increment it - there should not be a case where we can't find it since we load group in read
185                                                 if (itGroup != tree[index].groupCount.end()) {
186                                                         tree[index].groupCount[group]++;
187                                                 }                                               
188                                         }
189                                         
190                                         currentNode = index;
191                                         
192                                 }else{ //otherwise, error
193                                         m->mothurOut("Warning: cannot find taxon " + taxon + " in reference taxonomy tree at level " + toString(tree[currentNode].level) + " for " + seqName + ". This may cause totals of daughter levels not to add up in summary file."); m->mothurOutEndLine();
194                                         break;
195                                 }
196                         }
197                         
198                         level++;
199                         
200                         if ((seqTaxonomy == "") && (level < maxLevel)) {  //if you think you are done and you are not.
201                                 for (int k = level; k < maxLevel; k++) {  seqTaxonomy += "unclassified;";   }
202                         }
203                 }
204                 return 0;
205         }
206         catch(exception& e) {
207                 m->errorOut(e, "PhyloSummary", "addSeqToTree");
208                 exit(1);
209         }
210 }
211 /**************************************************************************************************/
212
213 int PhyloSummary::addSeqToTree(string seqTaxonomy, vector<string> names){
214         try {
215                 numSeqs++;
216                 
217                 map<string, int>::iterator childPointer;
218                 
219                 int currentNode = 0;
220                 string taxon;
221                 
222                 int level = 0;
223                 
224                 //are there confidence scores, if so remove them
225                 if (seqTaxonomy.find_first_of('(') != -1) {  m->removeConfidences(seqTaxonomy); }
226                 
227                 while (seqTaxonomy != "") {
228                         
229                         if (m->control_pressed) { return 0; }
230                         
231                         //somehow the parent is getting one too many accnos
232                         //use print to reassign the taxa id
233                         taxon = getNextTaxon(seqTaxonomy);
234                         
235                         childPointer = tree[currentNode].children.find(taxon);
236                         
237                         if(childPointer != tree[currentNode].children.end()){   //if the node already exists, update count and move on
238                                 if (groupmap != NULL) {
239                                         
240                                         map<string, bool> containsGroup; 
241                                         vector<string> mGroups = groupmap->getNamesOfGroups();
242                                         for (int j = 0; j < mGroups.size(); j++) {
243                                                 containsGroup[mGroups[j]] = false;
244                                         }
245                                         
246                                         for (int k = 0; k < names.size(); k++) {
247                                                 //find out the sequences group
248                                                 string group = groupmap->getGroup(names[k]);
249                                         
250                                                 if (group == "not found") {  m->mothurOut("[WARNING]: " + names[k] + " is not in your groupfile, and will be included in the overall total, but not any group total."); m->mothurOutEndLine();  }
251                                                 else {
252                                                         containsGroup[group] = true;
253                                                 }
254                                         }
255                                         
256                                         for (map<string, bool>::iterator itGroup = containsGroup.begin(); itGroup != containsGroup.end(); itGroup++) {
257                                                 if (itGroup->second == true) {
258                                                         tree[childPointer->second].groupCount[itGroup->first]++;
259                                                 }
260                                         }
261                                         
262                                 }
263                                 
264                                 tree[childPointer->second].total++;
265                                 
266                                 currentNode = childPointer->second;
267                         }else{  
268                                 if (ignore) {
269                                         
270                                         tree.push_back(rawTaxNode(taxon));
271                                         int index = tree.size() - 1;
272                                         
273                                         tree[index].parent = currentNode;
274                                         tree[index].level = (level+1);
275                                         tree[index].total = 1;
276                                         tree[currentNode].children[taxon] = index;
277                                         
278                                         //initialize groupcounts
279                                         if (groupmap != NULL) {
280                                                 map<string, bool> containsGroup; 
281                                                 vector<string> mGroups = groupmap->getNamesOfGroups();
282                                                 for (int j = 0; j < mGroups.size(); j++) {
283                                                         tree[index].groupCount[mGroups[j]] = 0;
284                                                         containsGroup[mGroups[j]] = false;
285                                                 }
286                                                 
287                                                 
288                                                 for (int k = 0; k < names.size(); k++) {
289                                                         //find out the sequences group
290                                                         string group = groupmap->getGroup(names[k]);
291                                                         
292                                                         if (group == "not found") {  m->mothurOut("[WARNING]: " + names[k] + " is not in your groupfile, and will be included in the overall total, but not any group total."); m->mothurOutEndLine();  }
293                                                         else {
294                                                                 containsGroup[group] = true;
295                                                         }
296                                                 }
297                                                 
298                                                 for (map<string, bool>::iterator itGroup = containsGroup.begin(); itGroup != containsGroup.end(); itGroup++) {
299                                                         if (itGroup->second == true) {
300                                                                 tree[index].groupCount[itGroup->first]++;
301                                                         }
302                                                 }
303                                         }
304                                         
305                                         currentNode = index;
306                                         
307                                 }else{ //otherwise, error
308                                         m->mothurOut("Warning: cannot find taxon " + taxon + " in reference taxonomy tree at level " + toString(tree[currentNode].level) + ". This may cause totals of daughter levels not to add up in summary file."); m->mothurOutEndLine();
309                                         break;
310                                 }
311                         }
312                         
313                         level++;
314                         
315                         if ((seqTaxonomy == "") && (level < maxLevel)) {  //if you think you are done and you are not.
316                                 for (int k = level; k < maxLevel; k++) {  seqTaxonomy += "unclassified;";   }
317                         }
318                 }
319                 return 0;
320         }
321         catch(exception& e) {
322                 m->errorOut(e, "PhyloSummary", "addSeqToTree");
323                 exit(1);
324         }
325 }
326
327 /**************************************************************************************************/
328
329 void PhyloSummary::assignRank(int index){
330         try {
331                 map<string,int>::iterator it;
332                 int counter = 1;
333                 
334                 for(it=tree[index].children.begin();it!=tree[index].children.end();it++){
335                         tree[it->second].rank = tree[index].rank + '.' + toString(counter);
336                         counter++;
337                                                                         
338                         assignRank(it->second);
339                 }
340         }
341         catch(exception& e) {
342                 m->errorOut(e, "PhyloSummary", "assignRank");
343                 exit(1);
344         }
345 }
346 /**************************************************************************************************/
347
348 void PhyloSummary::print(ofstream& out){
349         try {
350                 
351                 if (ignore) { assignRank(0); }
352         
353                 //print labels
354                 out << "taxlevel\t rankID\t taxon\t daughterlevels\t total\t";
355                 if (groupmap != NULL) {
356                         //so the labels match the counts below, since the map sorts them automatically...
357                         //sort(groupmap->namesOfGroups.begin(), groupmap->namesOfGroups.end());
358                         vector<string> mGroups = groupmap->getNamesOfGroups();
359                         for (int i = 0; i < mGroups.size(); i++) {
360                                 out << mGroups[i] << '\t';
361                         }
362                 }
363                 
364                 out << endl;
365                 
366                 int totalChildrenInTree = 0;
367                 map<string, int>::iterator itGroup;
368                 
369                 map<string,int>::iterator it;
370                 for(it=tree[0].children.begin();it!=tree[0].children.end();it++){   
371                         if (tree[it->second].total != 0)  {   
372                                 totalChildrenInTree++; 
373                                 tree[0].total += tree[it->second].total;
374                                 
375                                 if (groupmap != NULL) {
376                                         vector<string> mGroups = groupmap->getNamesOfGroups();
377                                         for (int i = 0; i < mGroups.size(); i++) { tree[0].groupCount[mGroups[i]] += tree[it->second].groupCount[mGroups[i]]; } 
378                                 }
379                         }
380                 }
381                 
382                 //print root
383                 out << tree[0].level << "\t" << tree[0].rank << "\t" << tree[0].name << "\t" << totalChildrenInTree << "\t" << tree[0].total << "\t";
384                 
385                 
386                 if (groupmap != NULL) {
387                         //for (itGroup = tree[0].groupCount.begin(); itGroup != tree[0].groupCount.end(); itGroup++) {
388                         //      out << itGroup->second << '\t';
389                         //}
390                         vector<string> mGroups = groupmap->getNamesOfGroups();
391                         for (int i = 0; i < mGroups.size(); i++) {  out << tree[0].groupCount[mGroups[i]] << '\t'; } 
392                 }
393                 out << endl;
394                 
395                 //print rest
396                 print(0, out);
397                 
398         }
399         catch(exception& e) {
400                 m->errorOut(e, "PhyloSummary", "print");
401                 exit(1);
402         }
403 }
404
405 /**************************************************************************************************/
406
407 void PhyloSummary::print(int i, ofstream& out){
408         try {
409                 map<string,int>::iterator it;
410                 for(it=tree[i].children.begin();it!=tree[i].children.end();it++){
411                         
412                         if (tree[it->second].total != 0)  {
413                         
414                                 int totalChildrenInTree = 0;
415                 
416                                 map<string,int>::iterator it2;
417                                 for(it2=tree[it->second].children.begin();it2!=tree[it->second].children.end();it2++){   
418                                         if (tree[it2->second].total != 0)  {   totalChildrenInTree++; }
419                                 }
420                         
421                                 out << tree[it->second].level << "\t" << tree[it->second].rank << "\t" << tree[it->second].name << "\t" << totalChildrenInTree << "\t" << tree[it->second].total << "\t";
422                                 
423                                 map<string, int>::iterator itGroup;
424                                 if (groupmap != NULL) {
425                                         //for (itGroup = tree[it->second].groupCount.begin(); itGroup != tree[it->second].groupCount.end(); itGroup++) {
426                                         //      out << itGroup->second << '\t';
427                                         //}
428                                         vector<string> mGroups = groupmap->getNamesOfGroups();
429                                         for (int i = 0; i < mGroups.size(); i++) {  out << tree[it->second].groupCount[mGroups[i]] << '\t'; } 
430                                 }
431                                 out << endl;
432                                 
433                         }
434                         
435                         print(it->second, out);
436                 }
437         }
438         catch(exception& e) {
439                 m->errorOut(e, "PhyloSummary", "print");
440                 exit(1);
441         }
442 }
443 /**************************************************************************************************/
444 void PhyloSummary::readTreeStruct(ifstream& in){
445         try {
446         
447                 //read version
448                 string line = m->getline(in); m->gobble(in);
449                 
450                 int num;
451                 
452                 in >> num; m->gobble(in);
453                 
454                 tree.resize(num);
455                 
456                 in >> maxLevel; m->gobble(in);
457         
458                 //read the tree file
459                 for (int i = 0; i < tree.size(); i++) {
460         
461                         in >> tree[i].level >> tree[i].name >> num; //num contains the number of children tree[i] has
462                         
463                         //set children
464                         string childName;
465                         int childIndex;
466                         for (int j = 0; j < num; j++) {
467                                 in >> childName >> childIndex;
468                                 tree[i].children[childName] = childIndex;
469                         }
470                         
471                         //initialize groupcounts
472                         if (groupmap != NULL) {
473                                 for (int j = 0; j < (groupmap->getNamesOfGroups()).size(); j++) {
474                                         tree[i].groupCount[(groupmap->getNamesOfGroups())[j]] = 0;
475                                 }
476                         }
477                         
478                         tree[i].total = 0;
479                         
480                         m->gobble(in);
481                         
482                         //if (tree[i].level > maxLevel) {  maxLevel = tree[i].level;  }
483                 }
484
485         }
486         catch(exception& e) {
487                 m->errorOut(e, "PhyloSummary", "readTreeStruct");
488                 exit(1);
489         }
490 }
491 /**************************************************************************************************/
492
493
494