]> git.donarmstrong.com Git - mothur.git/blob - phylosummary.cpp
Merge remote-tracking branch 'mothur/master'
[mothur.git] / phylosummary.cpp
1 /*
2  *  rawTrainingDataMaker.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 4/21/10.
6  *  Copyright 2010 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "phylosummary.h"
11 /**************************************************************************************************/
12
13 PhyloSummary::PhyloSummary(string refTfile, CountTable* c){
14         try {
15                 m = MothurOut::getInstance();
16                 maxLevel = 0;
17                 ignore = false;
18         numSeqs = 0;
19                 
20                 ct = c;
21         groupmap = NULL;
22         
23                 //check for necessary files
24                 string taxFileNameTest = m->getFullPathName((refTfile.substr(0,refTfile.find_last_of(".")+1) + "tree.sum"));
25                 ifstream FileTest(taxFileNameTest.c_str());
26                 
27                 if (!FileTest) { 
28                         m->mothurOut("Error: can't find " + taxFileNameTest + "."); m->mothurOutEndLine(); exit(1);
29                 }else{
30                         readTreeStruct(FileTest);
31                 }
32                 
33                 tree[0].rank = "0";
34                 assignRank(0);
35         
36         }
37         catch(exception& e) {
38                 m->errorOut(e, "PhyloSummary", "PhyloSummary");
39                 exit(1);
40         }
41 }
42
43 /**************************************************************************************************/
44
45 PhyloSummary::PhyloSummary(CountTable* c){
46         try {
47                 m = MothurOut::getInstance();
48                 maxLevel = 0;
49                 ignore = true;
50         numSeqs = 0;
51                 
52                 ct = c;
53         groupmap = NULL;
54                 
55                 tree.push_back(rawTaxNode("Root"));
56                 tree[0].rank = "0";
57         }
58         catch(exception& e) {
59                 m->errorOut(e, "PhyloSummary", "PhyloSummary");
60                 exit(1);
61         }
62 }
63 /**************************************************************************************************/
64 PhyloSummary::PhyloSummary(string refTfile, GroupMap* g){
65         try {
66                 m = MothurOut::getInstance();
67                 maxLevel = 0;
68                 ignore = false;
69         numSeqs = 0;
70                 
71                 groupmap = g;
72         ct = NULL;
73                                 
74                 //check for necessary files
75                 string taxFileNameTest = m->getFullPathName((refTfile.substr(0,refTfile.find_last_of(".")+1) + "tree.sum"));
76                 ifstream FileTest(taxFileNameTest.c_str());
77                 
78                 if (!FileTest) { 
79                         m->mothurOut("Error: can't find " + taxFileNameTest + "."); m->mothurOutEndLine(); exit(1);
80                 }else{
81                         readTreeStruct(FileTest);
82                 }
83                 
84                 tree[0].rank = "0";
85                 assignRank(0);
86
87         }
88         catch(exception& e) {
89                 m->errorOut(e, "PhyloSummary", "PhyloSummary");
90                 exit(1);
91         }
92 }
93
94 /**************************************************************************************************/
95
96 PhyloSummary::PhyloSummary(GroupMap* g){
97         try {
98                 m = MothurOut::getInstance();
99                 maxLevel = 0;
100                 ignore = true;
101         numSeqs = 0;
102                 
103                 groupmap = g;
104         ct = NULL;
105                 
106                 tree.push_back(rawTaxNode("Root"));
107                 tree[0].rank = "0";
108         }
109         catch(exception& e) {
110                 m->errorOut(e, "PhyloSummary", "PhyloSummary");
111                 exit(1);
112         }
113 }
114 /**************************************************************************************************/
115
116 int PhyloSummary::summarize(string userTfile){
117         try {
118                 map<string, string> temp;
119         m->readTax(userTfile, temp);
120         
121         for (map<string, string>::iterator itTemp = temp.begin(); itTemp != temp.end();) {
122             addSeqToTree(itTemp->first, itTemp->second);
123             temp.erase(itTemp++);
124         }
125         
126         return numSeqs;
127         }
128         catch(exception& e) {
129                 m->errorOut(e, "PhyloSummary", "summarize");
130                 exit(1);
131         }
132 }
133
134 /**************************************************************************************************/
135
136 string PhyloSummary::getNextTaxon(string& heirarchy){
137         try {
138                 string currentLevel = "";
139                 if(heirarchy != ""){
140                         int pos = heirarchy.find_first_of(';');
141                         currentLevel=heirarchy.substr(0,pos);
142                         if (pos != (heirarchy.length()-1)) {  heirarchy=heirarchy.substr(pos+1);  }
143                         else { heirarchy = ""; }
144                 }
145                 return currentLevel;
146         }
147         catch(exception& e) {
148                 m->errorOut(e, "PhyloSummary", "getNextTaxon");
149                 exit(1);
150         }
151 }
152
153 /**************************************************************************************************/
154
155 int PhyloSummary::addSeqToTree(string seqName, string seqTaxonomy){
156         try {
157                                 
158                 numSeqs++;
159                 
160                 map<string, int>::iterator childPointer;
161                 
162                 int currentNode = 0;
163                 string taxon;
164                 
165                 int level = 0;
166                 
167                 //are there confidence scores, if so remove them
168                 if (seqTaxonomy.find_first_of('(') != -1) {  m->removeConfidences(seqTaxonomy); }
169                 
170                 while (seqTaxonomy != "") {
171                         
172                         if (m->control_pressed) { return 0; }
173                         
174                         //somehow the parent is getting one too many accnos
175                         //use print to reassign the taxa id
176                         taxon = getNextTaxon(seqTaxonomy);
177                         
178                         childPointer = tree[currentNode].children.find(taxon);
179                         
180                         if(childPointer != tree[currentNode].children.end()){   //if the node already exists, update count and move on
181                                 int thisCount = 1;
182                 
183                 if (groupmap != NULL) {
184                                         //find out the sequences group
185                                         string group = groupmap->getGroup(seqName);
186                                         
187                                         if (group == "not found") {  m->mothurOut("[WARNING]: " + seqName + " is not in your groupfile, and will be included in the overall total, but not any group total."); m->mothurOutEndLine();  }
188                                         
189                                         //do you have a count for this group?
190                                         map<string, int>::iterator itGroup = tree[childPointer->second].groupCount.find(group);
191                                         
192                                         //if yes, increment it - there should not be a case where we can't find it since we load group in read
193                                         if (itGroup != tree[childPointer->second].groupCount.end()) {
194                                                 tree[childPointer->second].groupCount[group]++;
195                                         }
196                                 }else if (ct != NULL) {
197                     if (ct->hasGroupInfo()) {
198                         vector<int> groupCounts = ct->getGroupCounts(seqName);
199                         vector<string> groups = ct->getNamesOfGroups();
200                         for (int i = 0; i < groups.size(); i++) {
201                             
202                             if (groupCounts[i] != 0) {
203                                 //do you have a count for this group?
204                                 map<string, int>::iterator itGroup = tree[childPointer->second].groupCount.find(groups[i]);
205                                 
206                                 //if yes, increment it - there should not be a case where we can't find it since we load group in read
207                                 if (itGroup != tree[childPointer->second].groupCount.end()) {
208                                     tree[childPointer->second].groupCount[groups[i]] += groupCounts[i];
209                                 }
210                             }
211                         }
212                     }
213                     thisCount = ct->getNumSeqs(seqName);
214                 }
215                                 
216                                 tree[childPointer->second].total += thisCount;
217
218                                 currentNode = childPointer->second;
219                         }else{  
220                                 if (ignore) {
221                                                 
222                                         tree.push_back(rawTaxNode(taxon));
223                                         int index = tree.size() - 1;
224                                 
225                                         tree[index].parent = currentNode;
226                                         tree[index].level = (level+1);
227                                         tree[currentNode].children[taxon] = index;
228                     int thisCount = 1;
229                                         
230                                         //initialize groupcounts
231                                         if (groupmap != NULL) {
232                                                 vector<string> mGroups = groupmap->getNamesOfGroups();
233                                                 for (int j = 0; j < mGroups.size(); j++) {
234                                                         tree[index].groupCount[mGroups[j]] = 0;
235                                                 }
236                                                 
237                                                 //find out the sequences group
238                                                 string group = groupmap->getGroup(seqName);
239                                                 
240                                                 if (group == "not found") {  m->mothurOut("[WARNING]: " + seqName + " is not in your groupfile, and will be included in the overall total, but not any group total."); m->mothurOutEndLine();  }
241                                                 
242                                                 //do you have a count for this group?
243                                                 map<string, int>::iterator itGroup = tree[index].groupCount.find(group);
244                                                 
245                                                 //if yes, increment it - there should not be a case where we can't find it since we load group in read
246                                                 if (itGroup != tree[index].groupCount.end()) {
247                                                         tree[index].groupCount[group]++;
248                                                 }
249                                         }else if (ct != NULL) {
250                         if (ct->hasGroupInfo()) {
251                             vector<string> mGroups = ct->getNamesOfGroups();
252                             for (int j = 0; j < mGroups.size(); j++) {
253                                 tree[index].groupCount[mGroups[j]] = 0;
254                             }
255                             vector<int> groupCounts = ct->getGroupCounts(seqName);
256                             vector<string> groups = ct->getNamesOfGroups();
257                         
258                             for (int i = 0; i < groups.size(); i++) {
259                                 if (groupCounts[i] != 0) {
260                                    
261                                     //do you have a count for this group?
262                                     map<string, int>::iterator itGroup = tree[index].groupCount.find(groups[i]);
263                                      
264                                     //if yes, increment it - there should not be a case where we can't find it since we load group in read
265                                     if (itGroup != tree[index].groupCount.end()) {
266                                         tree[index].groupCount[groups[i]]+=groupCounts[i];
267                                     }
268                                 }
269                             }
270                         }
271                         thisCount = ct->getNumSeqs(seqName);
272                     }
273                                         
274                     tree[index].total = thisCount;
275                                         currentNode = index;
276                                         
277                                 }else{ //otherwise, error
278                                         m->mothurOut("Warning: cannot find taxon " + taxon + " in reference taxonomy tree at level " + toString(tree[currentNode].level) + " for " + seqName + ". This may cause totals of daughter levels not to add up in summary file."); m->mothurOutEndLine();
279                                         break;
280                                 }
281                         }
282                         
283                         level++;
284                         
285                         if ((seqTaxonomy == "") && (level < maxLevel)) {  //if you think you are done and you are not.
286                                 for (int k = level; k < maxLevel; k++) {  seqTaxonomy += "unclassified;";   }
287                         }
288                 }
289                 return 0;
290         }
291         catch(exception& e) {
292                 m->errorOut(e, "PhyloSummary", "addSeqToTree");
293                 exit(1);
294         }
295 }
296 /**************************************************************************************************/
297
298 int PhyloSummary::addSeqToTree(string seqTaxonomy, map<string, bool> containsGroup){
299         try {
300                 numSeqs++;
301                 
302                 map<string, int>::iterator childPointer;
303                 
304                 int currentNode = 0;
305                 string taxon;
306                 
307                 int level = 0;
308                 
309                 //are there confidence scores, if so remove them
310                 if (seqTaxonomy.find_first_of('(') != -1) {  m->removeConfidences(seqTaxonomy); }
311                 
312                 while (seqTaxonomy != "") {
313                         
314                         if (m->control_pressed) { return 0; }
315                         
316                         //somehow the parent is getting one too many accnos
317                         //use print to reassign the taxa id
318                         taxon = getNextTaxon(seqTaxonomy);
319                         
320                         childPointer = tree[currentNode].children.find(taxon);
321                         
322                         if(childPointer != tree[currentNode].children.end()){   //if the node already exists, update count and move on
323                 for (map<string, bool>::iterator itGroup = containsGroup.begin(); itGroup != containsGroup.end(); itGroup++) {
324                     if (itGroup->second == true) {
325                         tree[childPointer->second].groupCount[itGroup->first]++;
326                     }
327                 }
328                                         
329                                 tree[childPointer->second].total++;
330                                 
331                                 currentNode = childPointer->second;
332                         }else{  
333                                 if (ignore) {
334                                         
335                                         tree.push_back(rawTaxNode(taxon));
336                                         int index = tree.size() - 1;
337                                         
338                                         tree[index].parent = currentNode;
339                                         tree[index].level = (level+1);
340                                         tree[index].total = 1;
341                                         tree[currentNode].children[taxon] = index;
342                                                 
343                     for (map<string, bool>::iterator itGroup = containsGroup.begin(); itGroup != containsGroup.end(); itGroup++) {
344                         if (itGroup->second == true) {
345                             tree[index].groupCount[itGroup->first]++;
346                         }
347                     }
348                                         
349                                         currentNode = index;
350                                         
351                                 }else{ //otherwise, error
352                                         m->mothurOut("Warning: cannot find taxon " + taxon + " in reference taxonomy tree at level " + toString(tree[currentNode].level) + ". This may cause totals of daughter levels not to add up in summary file."); m->mothurOutEndLine();
353                                         break;
354                                 }
355                         }
356                         
357                         level++;
358                         
359                         if ((seqTaxonomy == "") && (level < maxLevel)) {  //if you think you are done and you are not.
360                                 for (int k = level; k < maxLevel; k++) {  seqTaxonomy += "unclassified;";   }
361                         }
362                 }
363                 return 0;
364         }
365         catch(exception& e) {
366                 m->errorOut(e, "PhyloSummary", "addSeqToTree");
367                 exit(1);
368         }
369 }
370
371 /**************************************************************************************************/
372
373 void PhyloSummary::assignRank(int index){
374         try {
375                 map<string,int>::iterator it;
376                 int counter = 1;
377                 
378                 for(it=tree[index].children.begin();it!=tree[index].children.end();it++){
379                         tree[it->second].rank = tree[index].rank + '.' + toString(counter);
380                         counter++;
381                                                                         
382                         assignRank(it->second);
383                 }
384         }
385         catch(exception& e) {
386                 m->errorOut(e, "PhyloSummary", "assignRank");
387                 exit(1);
388         }
389 }
390 /**************************************************************************************************/
391
392 void PhyloSummary::print(ofstream& out){
393         try {
394                 
395                 if (ignore) { assignRank(0); }
396         vector<string> mGroups;
397                 //print labels
398                 out << "taxlevel\t rankID\t taxon\t daughterlevels\t total\t";
399                 if (groupmap != NULL) {
400                         //so the labels match the counts below, since the map sorts them automatically...
401                         //sort(groupmap->namesOfGroups.begin(), groupmap->namesOfGroups.end());
402             mGroups = groupmap->getNamesOfGroups();
403                         for (int i = 0; i < mGroups.size(); i++) {
404                                 out << mGroups[i] << '\t';
405                         }
406                 }else if (ct != NULL) {
407             if (ct->hasGroupInfo()) {
408                 mGroups = ct->getNamesOfGroups();
409                 for (int i = 0; i < mGroups.size(); i++) {
410                     out << mGroups[i] << '\t';
411                 }
412             }
413         }
414                 
415                 out << endl;
416                 
417                 int totalChildrenInTree = 0;
418                 map<string, int>::iterator itGroup;
419                 
420                 map<string,int>::iterator it;
421                 for(it=tree[0].children.begin();it!=tree[0].children.end();it++){   
422                         if (tree[it->second].total != 0)  {   
423                                 totalChildrenInTree++; 
424                                 tree[0].total += tree[it->second].total;
425                                 
426                                 if (groupmap != NULL) {
427                                         for (int i = 0; i < mGroups.size(); i++) { tree[0].groupCount[mGroups[i]] += tree[it->second].groupCount[mGroups[i]]; } 
428                                 }else if ( ct != NULL) {
429                     if (ct->hasGroupInfo()) { for (int i = 0; i < mGroups.size(); i++) { tree[0].groupCount[mGroups[i]] += tree[it->second].groupCount[mGroups[i]]; } }
430                 }
431                         }
432                 }
433                 
434                 //print root
435                 out << tree[0].level << "\t" << tree[0].rank << "\t" << tree[0].name << "\t" << totalChildrenInTree << "\t" << tree[0].total << "\t";
436                 
437                 
438                 if (groupmap != NULL) {
439                         for (int i = 0; i < mGroups.size(); i++) {  out << tree[0].groupCount[mGroups[i]] << '\t'; }  
440         }else if ( ct != NULL) {
441             if (ct->hasGroupInfo()) { for (int i = 0; i < mGroups.size(); i++) {  out << tree[0].groupCount[mGroups[i]] << '\t'; } }
442         }
443                 out << endl;
444                 
445                 //print rest
446                 print(0, out);
447                 
448         }
449         catch(exception& e) {
450                 m->errorOut(e, "PhyloSummary", "print");
451                 exit(1);
452         }
453 }
454
455 /**************************************************************************************************/
456
457 void PhyloSummary::print(int i, ofstream& out){
458         try {
459                 map<string,int>::iterator it;
460                 for(it=tree[i].children.begin();it!=tree[i].children.end();it++){
461                         
462                         if (tree[it->second].total != 0)  {
463                         
464                                 int totalChildrenInTree = 0;
465                 
466                                 map<string,int>::iterator it2;
467                                 for(it2=tree[it->second].children.begin();it2!=tree[it->second].children.end();it2++){   
468                                         if (tree[it2->second].total != 0)  {   totalChildrenInTree++; }
469                                 }
470                         
471                                 out << tree[it->second].level << "\t" << tree[it->second].rank << "\t" << tree[it->second].name << "\t" << totalChildrenInTree << "\t" << tree[it->second].total << "\t";
472                                 
473                                 map<string, int>::iterator itGroup;
474                                 if (groupmap != NULL) {
475                                         //for (itGroup = tree[it->second].groupCount.begin(); itGroup != tree[it->second].groupCount.end(); itGroup++) {
476                                         //      out << itGroup->second << '\t';
477                                         //}
478                                         vector<string> mGroups = groupmap->getNamesOfGroups();
479                                         for (int i = 0; i < mGroups.size(); i++) {  out << tree[it->second].groupCount[mGroups[i]] << '\t'; } 
480                                 }else if (ct != NULL) {
481                     if (ct->hasGroupInfo()) {
482                         vector<string> mGroups = ct->getNamesOfGroups();
483                         for (int i = 0; i < mGroups.size(); i++) {  out << tree[it->second].groupCount[mGroups[i]] << '\t'; } 
484                     }
485                 }
486                                 out << endl;
487                                 
488                         }
489                         
490                         print(it->second, out);
491                 }
492         }
493         catch(exception& e) {
494                 m->errorOut(e, "PhyloSummary", "print");
495                 exit(1);
496         }
497 }
498 /**************************************************************************************************/
499 void PhyloSummary::readTreeStruct(ifstream& in){
500         try {
501         
502                 //read version
503                 string line = m->getline(in); m->gobble(in);
504                 
505                 int num;
506                 
507                 in >> num; m->gobble(in);
508                 
509                 tree.resize(num);
510                 
511                 in >> maxLevel; m->gobble(in);
512         
513                 //read the tree file
514                 for (int i = 0; i < tree.size(); i++) {
515         
516                         in >> tree[i].level >> tree[i].name >> num; //num contains the number of children tree[i] has
517                         
518                         //set children
519                         string childName;
520                         int childIndex;
521                         for (int j = 0; j < num; j++) {
522                                 in >> childName >> childIndex;
523                                 tree[i].children[childName] = childIndex;
524                         }
525                         
526                         //initialize groupcounts
527                         if (groupmap != NULL) {
528                                 for (int j = 0; j < (groupmap->getNamesOfGroups()).size(); j++) {
529                                         tree[i].groupCount[(groupmap->getNamesOfGroups())[j]] = 0;
530                                 }
531                         }else if (ct != NULL) {
532                 if (ct->hasGroupInfo()) {
533                     for (int j = 0; j < (ct->getNamesOfGroups()).size(); j++) {
534                         tree[i].groupCount[(ct->getNamesOfGroups())[j]] = 0;
535                     }
536                 }
537             }
538                         
539                         tree[i].total = 0;
540                         
541                         m->gobble(in);
542                         
543                         //if (tree[i].level > maxLevel) {  maxLevel = tree[i].level;  }
544                 }
545
546         }
547         catch(exception& e) {
548                 m->errorOut(e, "PhyloSummary", "readTreeStruct");
549                 exit(1);
550         }
551 }
552 /**************************************************************************************************/
553
554
555