]> git.donarmstrong.com Git - mothur.git/blob - phylosummary.cpp
working on get.coremicrobiom
[mothur.git] / phylosummary.cpp
1 /*
2  *  rawTrainingDataMaker.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 4/21/10.
6  *  Copyright 2010 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "phylosummary.h"
11
12 /**************************************************************************************************/
13
14 PhyloSummary::PhyloSummary(string refTfile, string groupFile){
15         try {
16                 m = MothurOut::getInstance();
17                 maxLevel = 0;
18                 ignore = false;
19                 
20                 if (groupFile != "") {
21                         groupmap = new GroupMap(groupFile);
22                         groupmap->readMap();
23                 }else{
24                         groupmap = NULL;
25                 }
26                                 
27                 //check for necessary files
28                 string taxFileNameTest = m->getFullPathName((refTfile.substr(0,refTfile.find_last_of(".")+1) + "tree.sum"));
29                 ifstream FileTest(taxFileNameTest.c_str());
30                 
31                 if (!FileTest) { 
32                         m->mothurOut("Error: can't find " + taxFileNameTest + "."); m->mothurOutEndLine(); exit(1);
33                 }else{
34                         readTreeStruct(FileTest);
35                 }
36                 
37                 tree[0].rank = "0";
38                 assignRank(0);
39
40         }
41         catch(exception& e) {
42                 m->errorOut(e, "PhyloSummary", "PhyloSummary");
43                 exit(1);
44         }
45 }
46
47 /**************************************************************************************************/
48
49 PhyloSummary::PhyloSummary(string groupFile){
50         try {
51                 m = MothurOut::getInstance();
52                 maxLevel = 0;
53                 ignore = true;
54                 
55                 if (groupFile != "") {
56                         groupmap = new GroupMap(groupFile);
57                         groupmap->readMap();
58                 }else{
59                         groupmap = NULL;
60                 }
61                 
62                 tree.push_back(rawTaxNode("Root"));
63                 tree[0].rank = "0";
64                 
65                 
66         }
67         catch(exception& e) {
68                 m->errorOut(e, "PhyloSummary", "PhyloSummary");
69                 exit(1);
70         }
71 }
72 /**************************************************************************************************/
73
74 int PhyloSummary::summarize(string userTfile){
75         try {
76                 
77                 ifstream in;
78                 m->openInputFile(userTfile, in);
79                 
80                 //read in users taxonomy file and add sequences to tree
81                 string name, tax;
82                 int numSeqs = 0;
83                 while(!in.eof()){
84                         in >> name >> tax; m->gobble(in);
85                         
86                         addSeqToTree(name, tax);
87                         numSeqs++;
88                         
89                         if (m->control_pressed) { break;  }
90                 }
91                 in.close();
92                 
93                 return numSeqs;
94         }
95         catch(exception& e) {
96                 m->errorOut(e, "PhyloSummary", "summarize");
97                 exit(1);
98         }
99 }
100
101 /**************************************************************************************************/
102
103 string PhyloSummary::getNextTaxon(string& heirarchy){
104         try {
105                 string currentLevel = "";
106                 if(heirarchy != ""){
107                         int pos = heirarchy.find_first_of(';');
108                         currentLevel=heirarchy.substr(0,pos);
109                         if (pos != (heirarchy.length()-1)) {  heirarchy=heirarchy.substr(pos+1);  }
110                         else { heirarchy = ""; }
111                 }
112                 return currentLevel;
113         }
114         catch(exception& e) {
115                 m->errorOut(e, "PhyloSummary", "getNextTaxon");
116                 exit(1);
117         }
118 }
119
120 /**************************************************************************************************/
121
122 int PhyloSummary::addSeqToTree(string seqName, string seqTaxonomy){
123         try {
124                                 
125                 numSeqs++;
126                 
127                 map<string, int>::iterator childPointer;
128                 
129                 int currentNode = 0;
130                 string taxon;
131                 
132                 int level = 0;
133                 
134                 //are there confidence scores, if so remove them
135                 if (seqTaxonomy.find_first_of('(') != -1) {  m->removeConfidences(seqTaxonomy); }
136                 
137                 while (seqTaxonomy != "") {
138                         
139                         if (m->control_pressed) { return 0; }
140                         
141                         //somehow the parent is getting one too many accnos
142                         //use print to reassign the taxa id
143                         taxon = getNextTaxon(seqTaxonomy);
144                         
145                         childPointer = tree[currentNode].children.find(taxon);
146                         
147                         if(childPointer != tree[currentNode].children.end()){   //if the node already exists, update count and move on
148                                 if (groupmap != NULL) {
149                                         //find out the sequences group
150                                         string group = groupmap->getGroup(seqName);
151                                         
152                                         if (group == "not found") {  m->mothurOut("[WARNING]: " + seqName + " is not in your groupfile, and will be included in the overall total, but not any group total."); m->mothurOutEndLine();  }
153                                         
154                                         //do you have a count for this group?
155                                         map<string, int>::iterator itGroup = tree[childPointer->second].groupCount.find(group);
156                                         
157                                         //if yes, increment it - there should not be a case where we can't find it since we load group in read
158                                         if (itGroup != tree[childPointer->second].groupCount.end()) {
159                                                 tree[childPointer->second].groupCount[group]++;
160                                         }
161                                 }
162                                 
163                                 tree[childPointer->second].total++;
164
165                                 currentNode = childPointer->second;
166                         }else{  
167                                 if (ignore) {
168                                                 
169                                         tree.push_back(rawTaxNode(taxon));
170                                         int index = tree.size() - 1;
171                                 
172                                         tree[index].parent = currentNode;
173                                         tree[index].level = (level+1);
174                                         tree[index].total = 1;
175                                         tree[currentNode].children[taxon] = index;
176                                         
177                                         //initialize groupcounts
178                                         if (groupmap != NULL) {
179                                                 vector<string> mGroups = groupmap->getNamesOfGroups();
180                                                 for (int j = 0; j < mGroups.size(); j++) {
181                                                         tree[index].groupCount[mGroups[j]] = 0;
182                                                 }
183                                                 
184                                                 //find out the sequences group
185                                                 string group = groupmap->getGroup(seqName);
186                                                 
187                                                 if (group == "not found") {  m->mothurOut("[WARNING]: " + seqName + " is not in your groupfile, and will be included in the overall total, but not any group total."); m->mothurOutEndLine();  }
188                                                 
189                                                 //do you have a count for this group?
190                                                 map<string, int>::iterator itGroup = tree[index].groupCount.find(group);
191                                                 
192                                                 //if yes, increment it - there should not be a case where we can't find it since we load group in read
193                                                 if (itGroup != tree[index].groupCount.end()) {
194                                                         tree[index].groupCount[group]++;
195                                                 }                                               
196                                         }
197                                         
198                                         currentNode = index;
199                                         
200                                 }else{ //otherwise, error
201                                         m->mothurOut("Warning: cannot find taxon " + taxon + " in reference taxonomy tree at level " + toString(tree[currentNode].level) + " for " + seqName + ". This may cause totals of daughter levels not to add up in summary file."); m->mothurOutEndLine();
202                                         break;
203                                 }
204                         }
205                         
206                         level++;
207                         
208                         if ((seqTaxonomy == "") && (level < maxLevel)) {  //if you think you are done and you are not.
209                                 for (int k = level; k < maxLevel; k++) {  seqTaxonomy += "unclassified;";   }
210                         }
211                 }
212                 return 0;
213         }
214         catch(exception& e) {
215                 m->errorOut(e, "PhyloSummary", "addSeqToTree");
216                 exit(1);
217         }
218 }
219 /**************************************************************************************************/
220
221 int PhyloSummary::addSeqToTree(string seqTaxonomy, vector<string> names){
222         try {
223                 numSeqs++;
224                 
225                 map<string, int>::iterator childPointer;
226                 
227                 int currentNode = 0;
228                 string taxon;
229                 
230                 int level = 0;
231                 
232                 //are there confidence scores, if so remove them
233                 if (seqTaxonomy.find_first_of('(') != -1) {  m->removeConfidences(seqTaxonomy); }
234                 
235                 while (seqTaxonomy != "") {
236                         
237                         if (m->control_pressed) { return 0; }
238                         
239                         //somehow the parent is getting one too many accnos
240                         //use print to reassign the taxa id
241                         taxon = getNextTaxon(seqTaxonomy);
242                         
243                         childPointer = tree[currentNode].children.find(taxon);
244                         
245                         if(childPointer != tree[currentNode].children.end()){   //if the node already exists, update count and move on
246                                 if (groupmap != NULL) {
247                                         
248                                         map<string, bool> containsGroup; 
249                                         vector<string> mGroups = groupmap->getNamesOfGroups();
250                                         for (int j = 0; j < mGroups.size(); j++) {
251                                                 containsGroup[mGroups[j]] = false;
252                                         }
253                                         
254                                         for (int k = 0; k < names.size(); k++) {
255                                                 //find out the sequences group
256                                                 string group = groupmap->getGroup(names[k]);
257                                         
258                                                 if (group == "not found") {  m->mothurOut("[WARNING]: " + names[k] + " is not in your groupfile, and will be included in the overall total, but not any group total."); m->mothurOutEndLine();  }
259                                                 else {
260                                                         containsGroup[group] = true;
261                                                 }
262                                         }
263                                         
264                                         for (map<string, bool>::iterator itGroup = containsGroup.begin(); itGroup != containsGroup.end(); itGroup++) {
265                                                 if (itGroup->second == true) {
266                                                         tree[childPointer->second].groupCount[itGroup->first]++;
267                                                 }
268                                         }
269                                         
270                                 }
271                                 
272                                 tree[childPointer->second].total++;
273                                 
274                                 currentNode = childPointer->second;
275                         }else{  
276                                 if (ignore) {
277                                         
278                                         tree.push_back(rawTaxNode(taxon));
279                                         int index = tree.size() - 1;
280                                         
281                                         tree[index].parent = currentNode;
282                                         tree[index].level = (level+1);
283                                         tree[index].total = 1;
284                                         tree[currentNode].children[taxon] = index;
285                                         
286                                         //initialize groupcounts
287                                         if (groupmap != NULL) {
288                                                 map<string, bool> containsGroup; 
289                                                 vector<string> mGroups = groupmap->getNamesOfGroups();
290                                                 for (int j = 0; j < mGroups.size(); j++) {
291                                                         tree[index].groupCount[mGroups[j]] = 0;
292                                                         containsGroup[mGroups[j]] = false;
293                                                 }
294                                                 
295                                                 
296                                                 for (int k = 0; k < names.size(); k++) {
297                                                         //find out the sequences group
298                                                         string group = groupmap->getGroup(names[k]);
299                                                         
300                                                         if (group == "not found") {  m->mothurOut("[WARNING]: " + names[k] + " is not in your groupfile, and will be included in the overall total, but not any group total."); m->mothurOutEndLine();  }
301                                                         else {
302                                                                 containsGroup[group] = true;
303                                                         }
304                                                 }
305                                                 
306                                                 for (map<string, bool>::iterator itGroup = containsGroup.begin(); itGroup != containsGroup.end(); itGroup++) {
307                                                         if (itGroup->second == true) {
308                                                                 tree[index].groupCount[itGroup->first]++;
309                                                         }
310                                                 }
311                                         }
312                                         
313                                         currentNode = index;
314                                         
315                                 }else{ //otherwise, error
316                                         m->mothurOut("Warning: cannot find taxon " + taxon + " in reference taxonomy tree at level " + toString(tree[currentNode].level) + ". This may cause totals of daughter levels not to add up in summary file."); m->mothurOutEndLine();
317                                         break;
318                                 }
319                         }
320                         
321                         level++;
322                         
323                         if ((seqTaxonomy == "") && (level < maxLevel)) {  //if you think you are done and you are not.
324                                 for (int k = level; k < maxLevel; k++) {  seqTaxonomy += "unclassified;";   }
325                         }
326                 }
327                 return 0;
328         }
329         catch(exception& e) {
330                 m->errorOut(e, "PhyloSummary", "addSeqToTree");
331                 exit(1);
332         }
333 }
334
335 /**************************************************************************************************/
336
337 void PhyloSummary::assignRank(int index){
338         try {
339                 map<string,int>::iterator it;
340                 int counter = 1;
341                 
342                 for(it=tree[index].children.begin();it!=tree[index].children.end();it++){
343                         tree[it->second].rank = tree[index].rank + '.' + toString(counter);
344                         counter++;
345                                                                         
346                         assignRank(it->second);
347                 }
348         }
349         catch(exception& e) {
350                 m->errorOut(e, "PhyloSummary", "assignRank");
351                 exit(1);
352         }
353 }
354 /**************************************************************************************************/
355
356 void PhyloSummary::print(ofstream& out){
357         try {
358                 
359                 if (ignore) { assignRank(0); }
360         
361                 //print labels
362                 out << "taxlevel\t rankID\t taxon\t daughterlevels\t total\t";
363                 if (groupmap != NULL) {
364                         //so the labels match the counts below, since the map sorts them automatically...
365                         //sort(groupmap->namesOfGroups.begin(), groupmap->namesOfGroups.end());
366                         vector<string> mGroups = groupmap->getNamesOfGroups();
367                         for (int i = 0; i < mGroups.size(); i++) {
368                                 out << mGroups[i] << '\t';
369                         }
370                 }
371                 
372                 out << endl;
373                 
374                 int totalChildrenInTree = 0;
375                 map<string, int>::iterator itGroup;
376                 
377                 map<string,int>::iterator it;
378                 for(it=tree[0].children.begin();it!=tree[0].children.end();it++){   
379                         if (tree[it->second].total != 0)  {   
380                                 totalChildrenInTree++; 
381                                 tree[0].total += tree[it->second].total;
382                                 
383                                 if (groupmap != NULL) {
384                                         vector<string> mGroups = groupmap->getNamesOfGroups();
385                                         for (int i = 0; i < mGroups.size(); i++) { tree[0].groupCount[mGroups[i]] += tree[it->second].groupCount[mGroups[i]]; } 
386                                 }
387                         }
388                 }
389                 
390                 //print root
391                 out << tree[0].level << "\t" << tree[0].rank << "\t" << tree[0].name << "\t" << totalChildrenInTree << "\t" << tree[0].total << "\t";
392                 
393                 
394                 if (groupmap != NULL) {
395                         //for (itGroup = tree[0].groupCount.begin(); itGroup != tree[0].groupCount.end(); itGroup++) {
396                         //      out << itGroup->second << '\t';
397                         //}
398                         vector<string> mGroups = groupmap->getNamesOfGroups();
399                         for (int i = 0; i < mGroups.size(); i++) {  out << tree[0].groupCount[mGroups[i]] << '\t'; } 
400                 }
401                 out << endl;
402                 
403                 //print rest
404                 print(0, out);
405                 
406         }
407         catch(exception& e) {
408                 m->errorOut(e, "PhyloSummary", "print");
409                 exit(1);
410         }
411 }
412
413 /**************************************************************************************************/
414
415 void PhyloSummary::print(int i, ofstream& out){
416         try {
417                 map<string,int>::iterator it;
418                 for(it=tree[i].children.begin();it!=tree[i].children.end();it++){
419                         
420                         if (tree[it->second].total != 0)  {
421                         
422                                 int totalChildrenInTree = 0;
423                 
424                                 map<string,int>::iterator it2;
425                                 for(it2=tree[it->second].children.begin();it2!=tree[it->second].children.end();it2++){   
426                                         if (tree[it2->second].total != 0)  {   totalChildrenInTree++; }
427                                 }
428                         
429                                 out << tree[it->second].level << "\t" << tree[it->second].rank << "\t" << tree[it->second].name << "\t" << totalChildrenInTree << "\t" << tree[it->second].total << "\t";
430                                 
431                                 map<string, int>::iterator itGroup;
432                                 if (groupmap != NULL) {
433                                         //for (itGroup = tree[it->second].groupCount.begin(); itGroup != tree[it->second].groupCount.end(); itGroup++) {
434                                         //      out << itGroup->second << '\t';
435                                         //}
436                                         vector<string> mGroups = groupmap->getNamesOfGroups();
437                                         for (int i = 0; i < mGroups.size(); i++) {  out << tree[it->second].groupCount[mGroups[i]] << '\t'; } 
438                                 }
439                                 out << endl;
440                                 
441                         }
442                         
443                         print(it->second, out);
444                 }
445         }
446         catch(exception& e) {
447                 m->errorOut(e, "PhyloSummary", "print");
448                 exit(1);
449         }
450 }
451 /**************************************************************************************************/
452 void PhyloSummary::readTreeStruct(ifstream& in){
453         try {
454         
455                 //read version
456                 string line = m->getline(in); m->gobble(in);
457                 
458                 int num;
459                 
460                 in >> num; m->gobble(in);
461                 
462                 tree.resize(num);
463                 
464                 in >> maxLevel; m->gobble(in);
465         
466                 //read the tree file
467                 for (int i = 0; i < tree.size(); i++) {
468         
469                         in >> tree[i].level >> tree[i].name >> num; //num contains the number of children tree[i] has
470                         
471                         //set children
472                         string childName;
473                         int childIndex;
474                         for (int j = 0; j < num; j++) {
475                                 in >> childName >> childIndex;
476                                 tree[i].children[childName] = childIndex;
477                         }
478                         
479                         //initialize groupcounts
480                         if (groupmap != NULL) {
481                                 for (int j = 0; j < (groupmap->getNamesOfGroups()).size(); j++) {
482                                         tree[i].groupCount[(groupmap->getNamesOfGroups())[j]] = 0;
483                                 }
484                         }
485                         
486                         tree[i].total = 0;
487                         
488                         m->gobble(in);
489                         
490                         //if (tree[i].level > maxLevel) {  maxLevel = tree[i].level;  }
491                 }
492
493         }
494         catch(exception& e) {
495                 m->errorOut(e, "PhyloSummary", "readTreeStruct");
496                 exit(1);
497         }
498 }
499 /**************************************************************************************************/
500
501
502