]> git.donarmstrong.com Git - mothur.git/blob - phylosummary.cpp
added modify names parameter to set.dir
[mothur.git] / phylosummary.cpp
1 /*
2  *  rawTrainingDataMaker.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 4/21/10.
6  *  Copyright 2010 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "phylosummary.h"
11 #include "referencedb.h"
12 /**************************************************************************************************/
13
14 PhyloSummary::PhyloSummary(string refTfile, CountTable* c){
15         try {
16                 m = MothurOut::getInstance();
17                 maxLevel = 0;
18                 ignore = false;
19         numSeqs = 0;
20                 
21                 ct = c;
22         groupmap = NULL;
23         
24                 //check for necessary files
25         if (refTfile == "saved") { ReferenceDB* rdb = ReferenceDB::getInstance(); refTfile = rdb->getSavedTaxonomy(); }
26                 string taxFileNameTest = m->getFullPathName((refTfile.substr(0,refTfile.find_last_of(".")+1) + "tree.sum"));
27                 ifstream FileTest(taxFileNameTest.c_str());
28                 
29                 if (!FileTest) { 
30                         m->mothurOut("Error: can't find " + taxFileNameTest + "."); m->mothurOutEndLine(); exit(1);
31                 }else{
32                         readTreeStruct(FileTest);
33                 }
34                 
35                 tree[0].rank = "0";
36                 assignRank(0);
37         
38         }
39         catch(exception& e) {
40                 m->errorOut(e, "PhyloSummary", "PhyloSummary");
41                 exit(1);
42         }
43 }
44
45 /**************************************************************************************************/
46
47 PhyloSummary::PhyloSummary(CountTable* c){
48         try {
49                 m = MothurOut::getInstance();
50                 maxLevel = 0;
51                 ignore = true;
52         numSeqs = 0;
53                 
54                 ct = c;
55         groupmap = NULL;
56                 
57                 tree.push_back(rawTaxNode("Root"));
58                 tree[0].rank = "0";
59         }
60         catch(exception& e) {
61                 m->errorOut(e, "PhyloSummary", "PhyloSummary");
62                 exit(1);
63         }
64 }
65 /**************************************************************************************************/
66 PhyloSummary::PhyloSummary(string refTfile, GroupMap* g){
67         try {
68                 m = MothurOut::getInstance();
69                 maxLevel = 0;
70                 ignore = false;
71         numSeqs = 0;
72                 
73                 groupmap = g;
74         ct = NULL;
75                                 
76                 //check for necessary files
77         if (refTfile == "saved") { ReferenceDB* rdb = ReferenceDB::getInstance(); refTfile = rdb->getSavedTaxonomy(); }
78                 string taxFileNameTest = m->getFullPathName((refTfile.substr(0,refTfile.find_last_of(".")+1) + "tree.sum"));
79                 ifstream FileTest(taxFileNameTest.c_str());
80                 
81                 if (!FileTest) { 
82                         m->mothurOut("Error: can't find " + taxFileNameTest + "."); m->mothurOutEndLine(); exit(1);
83                 }else{
84                         readTreeStruct(FileTest);
85                 }
86                 
87                 tree[0].rank = "0";
88                 assignRank(0);
89
90         }
91         catch(exception& e) {
92                 m->errorOut(e, "PhyloSummary", "PhyloSummary");
93                 exit(1);
94         }
95 }
96
97 /**************************************************************************************************/
98
99 PhyloSummary::PhyloSummary(GroupMap* g){
100         try {
101                 m = MothurOut::getInstance();
102                 maxLevel = 0;
103                 ignore = true;
104         numSeqs = 0;
105                 
106                 groupmap = g;
107         ct = NULL;
108                 
109                 tree.push_back(rawTaxNode("Root"));
110                 tree[0].rank = "0";
111         }
112         catch(exception& e) {
113                 m->errorOut(e, "PhyloSummary", "PhyloSummary");
114                 exit(1);
115         }
116 }
117 /**************************************************************************************************/
118
119 int PhyloSummary::summarize(string userTfile){
120         try {
121                 map<string, string> temp;
122         m->readTax(userTfile, temp);
123         
124         for (map<string, string>::iterator itTemp = temp.begin(); itTemp != temp.end();) {
125             addSeqToTree(itTemp->first, itTemp->second);
126             temp.erase(itTemp++);
127         }
128         
129         return numSeqs;
130         }
131         catch(exception& e) {
132                 m->errorOut(e, "PhyloSummary", "summarize");
133                 exit(1);
134         }
135 }
136
137 /**************************************************************************************************/
138
139 string PhyloSummary::getNextTaxon(string& heirarchy){
140         try {
141                 string currentLevel = "";
142                 if(heirarchy != ""){
143                         int pos = heirarchy.find_first_of(';');
144                         currentLevel=heirarchy.substr(0,pos);
145                         if (pos != (heirarchy.length()-1)) {  heirarchy=heirarchy.substr(pos+1);  }
146                         else { heirarchy = ""; }
147                 }
148                 return currentLevel;
149         }
150         catch(exception& e) {
151                 m->errorOut(e, "PhyloSummary", "getNextTaxon");
152                 exit(1);
153         }
154 }
155
156 /**************************************************************************************************/
157
158 int PhyloSummary::addSeqToTree(string seqName, string seqTaxonomy){
159         try {
160                                 
161                 numSeqs++;
162                 
163                 map<string, int>::iterator childPointer;
164                 
165                 int currentNode = 0;
166                 string taxon;
167                 
168                 int level = 0;
169                 
170                 //are there confidence scores, if so remove them
171                 if (seqTaxonomy.find_first_of('(') != -1) {  m->removeConfidences(seqTaxonomy); }
172                 
173                 while (seqTaxonomy != "") {
174                         
175                         if (m->control_pressed) { return 0; }
176                         
177                         //somehow the parent is getting one too many accnos
178                         //use print to reassign the taxa id
179                         taxon = getNextTaxon(seqTaxonomy);
180                         
181                         childPointer = tree[currentNode].children.find(taxon);
182                         
183                         if(childPointer != tree[currentNode].children.end()){   //if the node already exists, update count and move on
184                                 int thisCount = 1;
185                 
186                 if (groupmap != NULL) {
187                                         //find out the sequences group
188                                         string group = groupmap->getGroup(seqName);
189                                         
190                                         if (group == "not found") {  m->mothurOut("[WARNING]: " + seqName + " is not in your groupfile, and will be included in the overall total, but not any group total."); m->mothurOutEndLine();  }
191                                         
192                                         //do you have a count for this group?
193                                         map<string, int>::iterator itGroup = tree[childPointer->second].groupCount.find(group);
194                                         
195                                         //if yes, increment it - there should not be a case where we can't find it since we load group in read
196                                         if (itGroup != tree[childPointer->second].groupCount.end()) {
197                                                 tree[childPointer->second].groupCount[group]++;
198                                         }
199                                 }else if (ct != NULL) {
200                     if (ct->hasGroupInfo()) {
201                         vector<int> groupCounts = ct->getGroupCounts(seqName);
202                         vector<string> groups = ct->getNamesOfGroups();
203                         for (int i = 0; i < groups.size(); i++) {
204                             
205                             if (groupCounts[i] != 0) {
206                                 //do you have a count for this group?
207                                 map<string, int>::iterator itGroup = tree[childPointer->second].groupCount.find(groups[i]);
208                                 
209                                 //if yes, increment it - there should not be a case where we can't find it since we load group in read
210                                 if (itGroup != tree[childPointer->second].groupCount.end()) {
211                                     tree[childPointer->second].groupCount[groups[i]] += groupCounts[i];
212                                 }
213                             }
214                         }
215                     }
216                     thisCount = ct->getNumSeqs(seqName);
217                 }
218                                 
219                                 tree[childPointer->second].total += thisCount;
220
221                                 currentNode = childPointer->second;
222                         }else{  
223                                 if (ignore) {
224                                                 
225                                         tree.push_back(rawTaxNode(taxon));
226                                         int index = tree.size() - 1;
227                                 
228                                         tree[index].parent = currentNode;
229                                         tree[index].level = (level+1);
230                                         tree[currentNode].children[taxon] = index;
231                     int thisCount = 1;
232                                         
233                                         //initialize groupcounts
234                                         if (groupmap != NULL) {
235                                                 vector<string> mGroups = groupmap->getNamesOfGroups();
236                                                 for (int j = 0; j < mGroups.size(); j++) {
237                                                         tree[index].groupCount[mGroups[j]] = 0;
238                                                 }
239                                                 
240                                                 //find out the sequences group
241                                                 string group = groupmap->getGroup(seqName);
242                                                 
243                                                 if (group == "not found") {  m->mothurOut("[WARNING]: " + seqName + " is not in your groupfile, and will be included in the overall total, but not any group total."); m->mothurOutEndLine();  }
244                                                 
245                                                 //do you have a count for this group?
246                                                 map<string, int>::iterator itGroup = tree[index].groupCount.find(group);
247                                                 
248                                                 //if yes, increment it - there should not be a case where we can't find it since we load group in read
249                                                 if (itGroup != tree[index].groupCount.end()) {
250                                                         tree[index].groupCount[group]++;
251                                                 }
252                                         }else if (ct != NULL) {
253                         if (ct->hasGroupInfo()) {
254                             vector<string> mGroups = ct->getNamesOfGroups();
255                             for (int j = 0; j < mGroups.size(); j++) {
256                                 tree[index].groupCount[mGroups[j]] = 0;
257                             }
258                             vector<int> groupCounts = ct->getGroupCounts(seqName);
259                             vector<string> groups = ct->getNamesOfGroups();
260                         
261                             for (int i = 0; i < groups.size(); i++) {
262                                 if (groupCounts[i] != 0) {
263                                    
264                                     //do you have a count for this group?
265                                     map<string, int>::iterator itGroup = tree[index].groupCount.find(groups[i]);
266                                      
267                                     //if yes, increment it - there should not be a case where we can't find it since we load group in read
268                                     if (itGroup != tree[index].groupCount.end()) {
269                                         tree[index].groupCount[groups[i]]+=groupCounts[i];
270                                     }
271                                 }
272                             }
273                         }
274                         thisCount = ct->getNumSeqs(seqName);
275                     }
276                                         
277                     tree[index].total = thisCount;
278                                         currentNode = index;
279                                         
280                                 }else{ //otherwise, error
281                                         m->mothurOut("Warning: cannot find taxon " + taxon + " in reference taxonomy tree at level " + toString(tree[currentNode].level) + " for " + seqName + ". This may cause totals of daughter levels not to add up in summary file."); m->mothurOutEndLine();
282                                         break;
283                                 }
284                         }
285                         
286                         level++;
287                         
288                         if ((seqTaxonomy == "") && (level < maxLevel)) {  //if you think you are done and you are not.
289                                 for (int k = level; k < maxLevel; k++) {  seqTaxonomy += "unclassified;";   }
290                         }
291                 }
292                 return 0;
293         }
294         catch(exception& e) {
295                 m->errorOut(e, "PhyloSummary", "addSeqToTree");
296                 exit(1);
297         }
298 }
299 /**************************************************************************************************/
300
301 int PhyloSummary::addSeqToTree(string seqTaxonomy, map<string, bool> containsGroup){
302         try {
303                 numSeqs++;
304                 
305                 map<string, int>::iterator childPointer;
306                 
307                 int currentNode = 0;
308                 string taxon;
309                 
310                 int level = 0;
311                 
312                 //are there confidence scores, if so remove them
313                 if (seqTaxonomy.find_first_of('(') != -1) {  m->removeConfidences(seqTaxonomy); }
314                 
315                 while (seqTaxonomy != "") {
316                         
317                         if (m->control_pressed) { return 0; }
318                         
319                         //somehow the parent is getting one too many accnos
320                         //use print to reassign the taxa id
321                         taxon = getNextTaxon(seqTaxonomy);
322                         
323                         childPointer = tree[currentNode].children.find(taxon);
324                         
325                         if(childPointer != tree[currentNode].children.end()){   //if the node already exists, update count and move on
326                 for (map<string, bool>::iterator itGroup = containsGroup.begin(); itGroup != containsGroup.end(); itGroup++) {
327                     if (itGroup->second == true) {
328                         tree[childPointer->second].groupCount[itGroup->first]++;
329                     }
330                 }
331                                         
332                                 tree[childPointer->second].total++;
333                                 
334                                 currentNode = childPointer->second;
335                         }else{  
336                                 if (ignore) {
337                                         
338                                         tree.push_back(rawTaxNode(taxon));
339                                         int index = tree.size() - 1;
340                                         
341                                         tree[index].parent = currentNode;
342                                         tree[index].level = (level+1);
343                                         tree[index].total = 1;
344                                         tree[currentNode].children[taxon] = index;
345                                                 
346                     for (map<string, bool>::iterator itGroup = containsGroup.begin(); itGroup != containsGroup.end(); itGroup++) {
347                         if (itGroup->second == true) {
348                             tree[index].groupCount[itGroup->first]++;
349                         }
350                     }
351                                         
352                                         currentNode = index;
353                                         
354                                 }else{ //otherwise, error
355                                         m->mothurOut("Warning: cannot find taxon " + taxon + " in reference taxonomy tree at level " + toString(tree[currentNode].level) + ". This may cause totals of daughter levels not to add up in summary file."); m->mothurOutEndLine();
356                                         break;
357                                 }
358                         }
359                         
360                         level++;
361                         
362                         if ((seqTaxonomy == "") && (level < maxLevel)) {  //if you think you are done and you are not.
363                                 for (int k = level; k < maxLevel; k++) {  seqTaxonomy += "unclassified;";   }
364                         }
365                 }
366                 return 0;
367         }
368         catch(exception& e) {
369                 m->errorOut(e, "PhyloSummary", "addSeqToTree");
370                 exit(1);
371         }
372 }
373
374 /**************************************************************************************************/
375
376 void PhyloSummary::assignRank(int index){
377         try {
378                 map<string,int>::iterator it;
379                 int counter = 1;
380                 
381                 for(it=tree[index].children.begin();it!=tree[index].children.end();it++){
382                         tree[it->second].rank = tree[index].rank + '.' + toString(counter);
383                         counter++;
384                                                                         
385                         assignRank(it->second);
386                 }
387         }
388         catch(exception& e) {
389                 m->errorOut(e, "PhyloSummary", "assignRank");
390                 exit(1);
391         }
392 }
393 /**************************************************************************************************/
394
395 void PhyloSummary::print(ofstream& out){
396         try {
397                 
398                 if (ignore) { assignRank(0); }
399         vector<string> mGroups;
400                 //print labels
401                 out << "taxlevel\t rankID\t taxon\t daughterlevels\t total\t";
402                 if (groupmap != NULL) {
403                         //so the labels match the counts below, since the map sorts them automatically...
404                         //sort(groupmap->namesOfGroups.begin(), groupmap->namesOfGroups.end());
405             mGroups = groupmap->getNamesOfGroups();
406                         for (int i = 0; i < mGroups.size(); i++) {
407                                 out << mGroups[i] << '\t';
408                         }
409                 }else if (ct != NULL) {
410             if (ct->hasGroupInfo()) {
411                 mGroups = ct->getNamesOfGroups();
412                 for (int i = 0; i < mGroups.size(); i++) {
413                     out << mGroups[i] << '\t';
414                 }
415             }
416         }
417                 
418                 out << endl;
419                 
420                 int totalChildrenInTree = 0;
421                 map<string, int>::iterator itGroup;
422                 
423                 map<string,int>::iterator it;
424                 for(it=tree[0].children.begin();it!=tree[0].children.end();it++){   
425                         if (tree[it->second].total != 0)  {   
426                                 totalChildrenInTree++; 
427                                 tree[0].total += tree[it->second].total;
428                                 
429                                 if (groupmap != NULL) {
430                                         for (int i = 0; i < mGroups.size(); i++) { tree[0].groupCount[mGroups[i]] += tree[it->second].groupCount[mGroups[i]]; } 
431                                 }else if ( ct != NULL) {
432                     if (ct->hasGroupInfo()) { for (int i = 0; i < mGroups.size(); i++) { tree[0].groupCount[mGroups[i]] += tree[it->second].groupCount[mGroups[i]]; } }
433                 }
434                         }
435                 }
436                 
437                 //print root
438                 out << tree[0].level << "\t" << tree[0].rank << "\t" << tree[0].name << "\t" << totalChildrenInTree << "\t" << tree[0].total << "\t";
439                 
440                 
441                 if (groupmap != NULL) {
442                         for (int i = 0; i < mGroups.size(); i++) {  out << tree[0].groupCount[mGroups[i]] << '\t'; }  
443         }else if ( ct != NULL) {
444             if (ct->hasGroupInfo()) { for (int i = 0; i < mGroups.size(); i++) {  out << tree[0].groupCount[mGroups[i]] << '\t'; } }
445         }
446                 out << endl;
447                 
448                 //print rest
449                 print(0, out);
450                 
451         }
452         catch(exception& e) {
453                 m->errorOut(e, "PhyloSummary", "print");
454                 exit(1);
455         }
456 }
457
458 /**************************************************************************************************/
459
460 void PhyloSummary::print(int i, ofstream& out){
461         try {
462                 map<string,int>::iterator it;
463                 for(it=tree[i].children.begin();it!=tree[i].children.end();it++){
464                         
465                         if (tree[it->second].total != 0)  {
466                         
467                                 int totalChildrenInTree = 0;
468                 
469                                 map<string,int>::iterator it2;
470                                 for(it2=tree[it->second].children.begin();it2!=tree[it->second].children.end();it2++){   
471                                         if (tree[it2->second].total != 0)  {   totalChildrenInTree++; }
472                                 }
473                         
474                                 out << tree[it->second].level << "\t" << tree[it->second].rank << "\t" << tree[it->second].name << "\t" << totalChildrenInTree << "\t" << tree[it->second].total << "\t";
475                                 
476                                 map<string, int>::iterator itGroup;
477                                 if (groupmap != NULL) {
478                                         //for (itGroup = tree[it->second].groupCount.begin(); itGroup != tree[it->second].groupCount.end(); itGroup++) {
479                                         //      out << itGroup->second << '\t';
480                                         //}
481                                         vector<string> mGroups = groupmap->getNamesOfGroups();
482                                         for (int i = 0; i < mGroups.size(); i++) {  out << tree[it->second].groupCount[mGroups[i]] << '\t'; } 
483                                 }else if (ct != NULL) {
484                     if (ct->hasGroupInfo()) {
485                         vector<string> mGroups = ct->getNamesOfGroups();
486                         for (int i = 0; i < mGroups.size(); i++) {  out << tree[it->second].groupCount[mGroups[i]] << '\t'; } 
487                     }
488                 }
489                                 out << endl;
490                                 
491                         }
492                         
493                         print(it->second, out);
494                 }
495         }
496         catch(exception& e) {
497                 m->errorOut(e, "PhyloSummary", "print");
498                 exit(1);
499         }
500 }
501 /**************************************************************************************************/
502 void PhyloSummary::readTreeStruct(ifstream& in){
503         try {
504         
505                 //read version
506                 string line = m->getline(in); m->gobble(in);
507                 
508                 int num;
509                 
510                 in >> num; m->gobble(in);
511                 
512                 tree.resize(num);
513                 
514                 in >> maxLevel; m->gobble(in);
515         
516                 //read the tree file
517                 for (int i = 0; i < tree.size(); i++) {
518         
519                         in >> tree[i].level >> tree[i].name >> num; //num contains the number of children tree[i] has
520                         
521                         //set children
522                         string childName;
523                         int childIndex;
524                         for (int j = 0; j < num; j++) {
525                                 in >> childName >> childIndex;
526                                 tree[i].children[childName] = childIndex;
527                         }
528                         
529                         //initialize groupcounts
530                         if (groupmap != NULL) {
531                                 for (int j = 0; j < (groupmap->getNamesOfGroups()).size(); j++) {
532                                         tree[i].groupCount[(groupmap->getNamesOfGroups())[j]] = 0;
533                                 }
534                         }else if (ct != NULL) {
535                 if (ct->hasGroupInfo()) {
536                     for (int j = 0; j < (ct->getNamesOfGroups()).size(); j++) {
537                         tree[i].groupCount[(ct->getNamesOfGroups())[j]] = 0;
538                     }
539                 }
540             }
541                         
542                         tree[i].total = 0;
543                         
544                         m->gobble(in);
545                         
546                         //if (tree[i].level > maxLevel) {  maxLevel = tree[i].level;  }
547                 }
548
549         }
550         catch(exception& e) {
551                 m->errorOut(e, "PhyloSummary", "readTreeStruct");
552                 exit(1);
553         }
554 }
555 /**************************************************************************************************/
556
557
558