]> git.donarmstrong.com Git - mothur.git/blob - phylosummary.cpp
added multiple processors option for Windows users to align.seqs, dist.seqs, summary...
[mothur.git] / phylosummary.cpp
1 /*
2  *  rawTrainingDataMaker.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 4/21/10.
6  *  Copyright 2010 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "phylosummary.h"
11
12 /**************************************************************************************************/
13
14 PhyloSummary::PhyloSummary(string refTfile, string groupFile){
15         try {
16                 m = MothurOut::getInstance();
17                 maxLevel = 0;
18                 ignore = false;
19                 
20                 if (groupFile != "") {
21                         groupmap = new GroupMap(groupFile);
22                         groupmap->readMap();
23                 }else{
24                         groupmap = NULL;
25                 }
26                                 
27                 //check for necessary files
28                 string taxFileNameTest = refTfile.substr(0,refTfile.find_last_of(".")+1) + "tree.sum";
29                 ifstream FileTest(taxFileNameTest.c_str());
30                 
31                 if (!FileTest) { 
32                         m->mothurOut("Error: can't find " + taxFileNameTest + "."); m->mothurOutEndLine(); exit(1);
33                 }else{
34                         readTreeStruct(FileTest);
35                 }
36                 
37                 tree[0].rank = "0";
38                 assignRank(0);
39
40         }
41         catch(exception& e) {
42                 m->errorOut(e, "PhyloSummary", "PhyloSummary");
43                 exit(1);
44         }
45 }
46
47 /**************************************************************************************************/
48
49 PhyloSummary::PhyloSummary(string groupFile){
50         try {
51                 m = MothurOut::getInstance();
52                 maxLevel = 0;
53                 ignore = true;
54                 
55                 if (groupFile != "") {
56                         groupmap = new GroupMap(groupFile);
57                         groupmap->readMap();
58                 }else{
59                         groupmap = NULL;
60                 }
61                 
62                 tree.push_back(rawTaxNode("Root"));
63                 tree[0].rank = "0";
64                 
65                 
66         }
67         catch(exception& e) {
68                 m->errorOut(e, "PhyloSummary", "PhyloSummary");
69                 exit(1);
70         }
71 }
72 /**************************************************************************************************/
73
74 void PhyloSummary::summarize(string userTfile){
75         try {
76                 
77                 ifstream in;
78                 m->openInputFile(userTfile, in);
79                 
80                 //read in users taxonomy file and add sequences to tree
81                 string name, tax;
82                 while(!in.eof()){
83                         in >> name >> tax; m->gobble(in);
84                         
85                         addSeqToTree(name, tax);
86                         
87                         if (m->control_pressed) { break;  }
88                 }
89                 in.close();
90         }
91         catch(exception& e) {
92                 m->errorOut(e, "PhyloSummary", "summarize");
93                 exit(1);
94         }
95 }
96
97 /**************************************************************************************************/
98
99 string PhyloSummary::getNextTaxon(string& heirarchy){
100         try {
101                 string currentLevel = "";
102                 if(heirarchy != ""){
103                         int pos = heirarchy.find_first_of(';');
104                         currentLevel=heirarchy.substr(0,pos);
105                         if (pos != (heirarchy.length()-1)) {  heirarchy=heirarchy.substr(pos+1);  }
106                         else { heirarchy = ""; }
107                 }
108                 return currentLevel;
109         }
110         catch(exception& e) {
111                 m->errorOut(e, "PhyloSummary", "getNextTaxon");
112                 exit(1);
113         }
114 }
115
116 /**************************************************************************************************/
117
118 int PhyloSummary::addSeqToTree(string seqName, string seqTaxonomy){
119         try {
120                 numSeqs++;
121                 
122                 map<string, int>::iterator childPointer;
123                 
124                 int currentNode = 0;
125                 string taxon;
126                 
127                 int level = 0;
128                 
129                 while (seqTaxonomy != "") {
130                         
131                         if (m->control_pressed) { return 0; }
132                         
133                         //somehow the parent is getting one too many accnos
134                         //use print to reassign the taxa id
135                         taxon = getNextTaxon(seqTaxonomy);
136                         
137                         childPointer = tree[currentNode].children.find(taxon);
138                         
139                         if(childPointer != tree[currentNode].children.end()){   //if the node already exists, update count and move on
140                                 if (groupmap != NULL) {
141                                         //find out the sequences group
142                                         string group = groupmap->getGroup(seqName);
143                                         
144                                         if (group == "not found") {  m->mothurOut(seqName + " is not in your groupfile, and will be included in the overall total, but not any group total."); m->mothurOutEndLine();  }
145                                         
146                                         //do you have a count for this group?
147                                         map<string, int>::iterator itGroup = tree[childPointer->second].groupCount.find(group);
148                                         
149                                         //if yes, increment it - there should not be a case where we can't find it since we load group in read
150                                         if (itGroup != tree[childPointer->second].groupCount.end()) {
151                                                 tree[childPointer->second].groupCount[group]++;
152                                         }
153                                 }
154                                 
155                                 tree[childPointer->second].total++;
156
157                                 currentNode = childPointer->second;
158                         }else{  
159                                 if (ignore) {
160                                                 
161                                         tree.push_back(rawTaxNode(taxon));
162                                         int index = tree.size() - 1;
163                                 
164                                         tree[index].parent = currentNode;
165                                         tree[index].level = (level+1);
166                                         tree[index].total = 1;
167                                         tree[currentNode].children[taxon] = index;
168                                         
169                                         //initialize groupcounts
170                                         if (groupmap != NULL) {
171                                                 vector<string> mGroups = groupmap->getNamesOfGroups();
172                                                 for (int j = 0; j < mGroups.size(); j++) {
173                                                         tree[index].groupCount[mGroups[j]] = 0;
174                                                 }
175                                                 
176                                                 //find out the sequences group
177                                                 string group = groupmap->getGroup(seqName);
178                                                 
179                                                 if (group == "not found") {  m->mothurOut(seqName + " is not in your groupfile, and will be included in the overall total, but not any group total."); m->mothurOutEndLine();  }
180                                                 
181                                                 //do you have a count for this group?
182                                                 map<string, int>::iterator itGroup = tree[index].groupCount.find(group);
183                                                 
184                                                 //if yes, increment it - there should not be a case where we can't find it since we load group in read
185                                                 if (itGroup != tree[index].groupCount.end()) {
186                                                         tree[index].groupCount[group]++;
187                                                 }                                               
188                                         }
189                                         
190                                         currentNode = index;
191                                         
192                                 }else{ //otherwise, error
193                                         m->mothurOut("Warning: cannot find taxon " + taxon + " in reference taxonomy tree at level " + toString(tree[currentNode].level) + " for " + seqName + ". This may cause totals of daughter levels not to add up in summary file."); m->mothurOutEndLine();
194                                         break;
195                                 }
196                         }
197                         
198                         level++;
199                         
200                         if ((seqTaxonomy == "") && (level < maxLevel)) {  //if you think you are done and you are not.
201                                 for (int k = level; k < maxLevel; k++) {  seqTaxonomy += "unclassified;";   }
202                         }
203                 }
204                 return 0;
205         }
206         catch(exception& e) {
207                 m->errorOut(e, "PhyloSummary", "addSeqToTree");
208                 exit(1);
209         }
210 }
211 /**************************************************************************************************/
212
213 int PhyloSummary::addSeqToTree(string seqTaxonomy, vector<string> names){
214         try {
215                 numSeqs++;
216                 
217                 map<string, int>::iterator childPointer;
218                 
219                 int currentNode = 0;
220                 string taxon;
221                 
222                 int level = 0;
223                 
224                 while (seqTaxonomy != "") {
225                         
226                         if (m->control_pressed) { return 0; }
227                         
228                         //somehow the parent is getting one too many accnos
229                         //use print to reassign the taxa id
230                         taxon = getNextTaxon(seqTaxonomy);
231                         
232                         childPointer = tree[currentNode].children.find(taxon);
233                         
234                         if(childPointer != tree[currentNode].children.end()){   //if the node already exists, update count and move on
235                                 if (groupmap != NULL) {
236                                         
237                                         map<string, bool> containsGroup; 
238                                         vector<string> mGroups = groupmap->getNamesOfGroups();
239                                         for (int j = 0; j < mGroups.size(); j++) {
240                                                 containsGroup[mGroups[j]] = false;
241                                         }
242                                         
243                                         for (int k = 0; k < names.size(); k++) {
244                                                 //find out the sequences group
245                                                 string group = groupmap->getGroup(names[k]);
246                                         
247                                                 if (group == "not found") {  m->mothurOut(names[k] + " is not in your groupfile, and will be included in the overall total, but not any group total."); m->mothurOutEndLine();  }
248                                                 else {
249                                                         containsGroup[group] = true;
250                                                 }
251                                         }
252                                         
253                                         for (map<string, bool>::iterator itGroup = containsGroup.begin(); itGroup != containsGroup.end(); itGroup++) {
254                                                 if (itGroup->second == true) {
255                                                         tree[childPointer->second].groupCount[itGroup->first]++;
256                                                 }
257                                         }
258                                         
259                                 }
260                                 
261                                 tree[childPointer->second].total++;
262                                 
263                                 currentNode = childPointer->second;
264                         }else{  
265                                 if (ignore) {
266                                         
267                                         tree.push_back(rawTaxNode(taxon));
268                                         int index = tree.size() - 1;
269                                         
270                                         tree[index].parent = currentNode;
271                                         tree[index].level = (level+1);
272                                         tree[index].total = 1;
273                                         tree[currentNode].children[taxon] = index;
274                                         
275                                         //initialize groupcounts
276                                         if (groupmap != NULL) {
277                                                 map<string, bool> containsGroup; 
278                                                 vector<string> mGroups = groupmap->getNamesOfGroups();
279                                                 for (int j = 0; j < mGroups.size(); j++) {
280                                                         tree[index].groupCount[mGroups[j]] = 0;
281                                                         containsGroup[mGroups[j]] = false;
282                                                 }
283                                                 
284                                                 
285                                                 for (int k = 0; k < names.size(); k++) {
286                                                         //find out the sequences group
287                                                         string group = groupmap->getGroup(names[k]);
288                                                         
289                                                         if (group == "not found") {  m->mothurOut(names[k] + " is not in your groupfile, and will be included in the overall total, but not any group total."); m->mothurOutEndLine();  }
290                                                         else {
291                                                                 containsGroup[group] = true;
292                                                         }
293                                                 }
294                                                 
295                                                 for (map<string, bool>::iterator itGroup = containsGroup.begin(); itGroup != containsGroup.end(); itGroup++) {
296                                                         if (itGroup->second == true) {
297                                                                 tree[index].groupCount[itGroup->first]++;
298                                                         }
299                                                 }
300                                         }
301                                         
302                                         currentNode = index;
303                                         
304                                 }else{ //otherwise, error
305                                         m->mothurOut("Warning: cannot find taxon " + taxon + " in reference taxonomy tree at level " + toString(tree[currentNode].level) + ". This may cause totals of daughter levels not to add up in summary file."); m->mothurOutEndLine();
306                                         break;
307                                 }
308                         }
309                         
310                         level++;
311                         
312                         if ((seqTaxonomy == "") && (level < maxLevel)) {  //if you think you are done and you are not.
313                                 for (int k = level; k < maxLevel; k++) {  seqTaxonomy += "unclassified;";   }
314                         }
315                 }
316                 return 0;
317         }
318         catch(exception& e) {
319                 m->errorOut(e, "PhyloSummary", "addSeqToTree");
320                 exit(1);
321         }
322 }
323
324 /**************************************************************************************************/
325
326 void PhyloSummary::assignRank(int index){
327         try {
328                 map<string,int>::iterator it;
329                 int counter = 1;
330                 
331                 for(it=tree[index].children.begin();it!=tree[index].children.end();it++){
332                         tree[it->second].rank = tree[index].rank + '.' + toString(counter);
333                         counter++;
334                                                                         
335                         assignRank(it->second);
336                 }
337         }
338         catch(exception& e) {
339                 m->errorOut(e, "PhyloSummary", "assignRank");
340                 exit(1);
341         }
342 }
343 /**************************************************************************************************/
344
345 void PhyloSummary::print(ofstream& out){
346         try {
347                 
348                 if (ignore) { assignRank(0); }
349         
350                 //print labels
351                 out << "taxlevel\t rankID\t taxon\t daughterlevels\t total\t";
352                 if (groupmap != NULL) {
353                         //so the labels match the counts below, since the map sorts them automatically...
354                         //sort(groupmap->namesOfGroups.begin(), groupmap->namesOfGroups.end());
355                         vector<string> mGroups = groupmap->getNamesOfGroups();
356                         for (int i = 0; i < mGroups.size(); i++) {
357                                 out << mGroups[i] << '\t';
358                         }
359                 }
360                 
361                 out << endl;
362                 
363                 int totalChildrenInTree = 0;
364                 
365                 map<string,int>::iterator it;
366                 for(it=tree[0].children.begin();it!=tree[0].children.end();it++){   
367                         if (tree[it->second].total != 0)  {   totalChildrenInTree++; }
368                 }
369                 
370                 //print root
371                 out << tree[0].level << "\t" << tree[0].rank << "\t" << tree[0].name << "\t" << totalChildrenInTree << "\t" << tree[0].total << "\t";
372                 
373                 map<string, int>::iterator itGroup;
374                 if (groupmap != NULL) {
375                         //for (itGroup = tree[0].groupCount.begin(); itGroup != tree[0].groupCount.end(); itGroup++) {
376                         //      out << itGroup->second << '\t';
377                         //}
378                         vector<string> mGroups = groupmap->getNamesOfGroups();
379                         for (int i = 0; i < mGroups.size(); i++) {  out << tree[0].groupCount[mGroups[i]] << '\t'; } 
380                 }
381                 out << endl;
382                 
383                 //print rest
384                 print(0, out);
385                 
386         }
387         catch(exception& e) {
388                 m->errorOut(e, "PhyloSummary", "print");
389                 exit(1);
390         }
391 }
392
393 /**************************************************************************************************/
394
395 void PhyloSummary::print(int i, ofstream& out){
396         try {
397                 map<string,int>::iterator it;
398                 for(it=tree[i].children.begin();it!=tree[i].children.end();it++){
399                         
400                         if (tree[it->second].total != 0)  {
401                         
402                                 int totalChildrenInTree = 0;
403                 
404                                 map<string,int>::iterator it2;
405                                 for(it2=tree[it->second].children.begin();it2!=tree[it->second].children.end();it2++){   
406                                         if (tree[it2->second].total != 0)  {   totalChildrenInTree++; }
407                                 }
408                         
409                                 out << tree[it->second].level << "\t" << tree[it->second].rank << "\t" << tree[it->second].name << "\t" << totalChildrenInTree << "\t" << tree[it->second].total << "\t";
410                                 
411                                 map<string, int>::iterator itGroup;
412                                 if (groupmap != NULL) {
413                                         //for (itGroup = tree[it->second].groupCount.begin(); itGroup != tree[it->second].groupCount.end(); itGroup++) {
414                                         //      out << itGroup->second << '\t';
415                                         //}
416                                         vector<string> mGroups = groupmap->getNamesOfGroups();
417                                         for (int i = 0; i < mGroups.size(); i++) {  out << tree[it->second].groupCount[mGroups[i]] << '\t'; } 
418                                 }
419                                 out << endl;
420                                 
421                         }
422                         
423                         print(it->second, out);
424                 }
425         }
426         catch(exception& e) {
427                 m->errorOut(e, "PhyloSummary", "print");
428                 exit(1);
429         }
430 }
431 /**************************************************************************************************/
432 void PhyloSummary::readTreeStruct(ifstream& in){
433         try {
434         
435                 //read version
436                 string line = m->getline(in); m->gobble(in);
437                 
438                 int num;
439                 
440                 in >> num; m->gobble(in);
441                 
442                 tree.resize(num);
443                 
444                 in >> maxLevel; m->gobble(in);
445         
446                 //read the tree file
447                 for (int i = 0; i < tree.size(); i++) {
448         
449                         in >> tree[i].level >> tree[i].name >> num; //num contains the number of children tree[i] has
450                         
451                         //set children
452                         string childName;
453                         int childIndex;
454                         for (int j = 0; j < num; j++) {
455                                 in >> childName >> childIndex;
456                                 tree[i].children[childName] = childIndex;
457                         }
458                         
459                         //initialize groupcounts
460                         if (groupmap != NULL) {
461                                 for (int j = 0; j < (groupmap->getNamesOfGroups()).size(); j++) {
462                                         tree[i].groupCount[(groupmap->getNamesOfGroups())[j]] = 0;
463                                 }
464                         }
465                         
466                         tree[i].total = 0;
467                         
468                         m->gobble(in);
469                         
470                         //if (tree[i].level > maxLevel) {  maxLevel = tree[i].level;  }
471                 }
472
473         }
474         catch(exception& e) {
475                 m->errorOut(e, "PhyloSummary", "print");
476                 exit(1);
477         }
478 }
479
480 /**************************************************************************************************/
481
482
483