]> git.donarmstrong.com Git - mothur.git/blob - phylotree.cpp
made classifier faster
[mothur.git] / phylotree.cpp
1 /*
2  *  doTaxonomy.cpp
3  *  
4  *
5  *  Created by Pat Schloss on 6/17/09.
6  *  Copyright 2009 Patrick D. Schloss. All rights reserved.
7  *
8  */
9
10 #include "phylotree.h"
11
12 /**************************************************************************************************/
13
14 PhyloTree::PhyloTree(){
15         try {
16                 m = MothurOut::getInstance();
17                 numNodes = 1;
18                 numSeqs = 0;
19                 tree.push_back(TaxNode("Root"));
20                 tree[0].heirarchyID = "0";
21                 maxLevel = 0;
22                 calcTotals = true;
23         }
24         catch(exception& e) {
25                 m->errorOut(e, "PhyloTree", "PhyloTree");
26                 exit(1);
27         }
28 }
29 /**************************************************************************************************/
30
31 PhyloTree::PhyloTree(ifstream& in){
32         try {
33                 m = MothurOut::getInstance();
34                 calcTotals = false;
35                 
36                 in >> numNodes; gobble(in);
37                 
38                 tree.resize(numNodes);
39                 
40                 for (int i = 0; i < tree.size(); i++) {
41                         in >> tree[i].name >> tree[i].level >> tree[i].parent; gobble(in);
42  
43                 }
44                 
45                 //read genus nodes
46                 int numGenus = 0;
47                 in >> numGenus; gobble(in);
48         
49                 int gnode, gsize;
50                 totals.clear();
51                 for (int i = 0; i < numGenus; i++) {
52                         in >> gnode >> gsize; gobble(in);
53                         
54                         uniqueTaxonomies[gnode] = gnode;
55                         totals.push_back(gsize);
56                 }
57                         
58                 in.close();
59                 
60         }
61         catch(exception& e) {
62                 m->errorOut(e, "PhyloTree", "PhyloTree");
63                 exit(1);
64         }
65 }
66 /**************************************************************************************************/
67
68 PhyloTree::PhyloTree(string tfile){
69         try {
70                 m = MothurOut::getInstance();
71                 numNodes = 1;
72                 numSeqs = 0;
73                 tree.push_back(TaxNode("Root"));
74                 tree[0].heirarchyID = "0";
75                 maxLevel = 0;
76                 calcTotals = true;
77                 
78                 ifstream in;
79                 openInputFile(tfile, in);
80                 
81                 //read in users taxonomy file and add sequences to tree
82                 string name, tax;
83                 while(!in.eof()){
84                         in >> name >> tax; gobble(in);
85                         
86                         addSeqToTree(name, tax);
87                 }
88                 in.close();
89         
90                 assignHeirarchyIDs(0);
91         }
92         catch(exception& e) {
93                 m->errorOut(e, "PhyloTree", "PhyloTree");
94                 exit(1);
95         }
96 }
97
98 /**************************************************************************************************/
99
100 string PhyloTree::getNextTaxon(string& heirarchy){
101         try {
102                 string currentLevel = "";
103                 if(heirarchy != ""){
104                         int pos = heirarchy.find_first_of(';');
105                         currentLevel=heirarchy.substr(0,pos);
106                         if (pos != (heirarchy.length()-1)) {  heirarchy=heirarchy.substr(pos+1);  }
107                         else { heirarchy = ""; }
108                 }
109                 return currentLevel;
110         }
111         catch(exception& e) {
112                 m->errorOut(e, "PhyloTree", "getNextTaxon");
113                 exit(1);
114         }
115 }
116
117 /**************************************************************************************************/
118
119 int PhyloTree::addSeqToTree(string seqName, string seqTaxonomy){
120         try {
121                 numSeqs++;
122                 
123                 map<string, int>::iterator childPointer;
124                 
125                 int currentNode = 0;
126                 int level = 1;
127                 
128                 tree[0].accessions.push_back(seqName);
129                 string taxon;// = getNextTaxon(seqTaxonomy);
130                 
131                 while(seqTaxonomy != ""){
132                         
133                         level++;
134                         
135                         if (m->control_pressed) { return 0; }
136                         
137                         //somehow the parent is getting one too many accnos
138                         //use print to reassign the taxa id
139                         taxon = getNextTaxon(seqTaxonomy);
140                         
141                         childPointer = tree[currentNode].children.find(taxon);
142                         
143                         if(childPointer != tree[currentNode].children.end()){   //if the node already exists, move on
144                                 currentNode = childPointer->second;
145                                 tree[currentNode].accessions.push_back(seqName);
146                                 name2Taxonomy[seqName] = currentNode;
147                         }
148                         else{                                                                                   //otherwise, create it
149                                 tree.push_back(TaxNode(taxon));
150                                 numNodes++;
151                                 tree[currentNode].children[taxon] = numNodes-1;
152                                 tree[numNodes-1].parent = currentNode;
153                                 
154                                 //                      int numChildren = tree[currentNode].children.size();
155                                 //                      string heirarchyID = tree[currentNode].heirarchyID;
156                                 //                      tree[currentNode].accessions.push_back(seqName);
157                                 
158                                 currentNode = tree[currentNode].children[taxon];
159                                 tree[currentNode].accessions.push_back(seqName);
160                                 name2Taxonomy[seqName] = currentNode;
161                                 //                      tree[currentNode].level = level;
162                                 //                      tree[currentNode].childNumber = numChildren;
163                                 //                      tree[currentNode].heirarchyID = heirarchyID + '.' + toString(tree[currentNode].childNumber);
164                         }
165                 
166                         if (seqTaxonomy == "") {   uniqueTaxonomies[currentNode] = currentNode; }
167                 }
168
169         }
170         catch(exception& e) {
171                 m->errorOut(e, "PhyloTree", "addSeqToTree");
172                 exit(1);
173         }
174 }
175 /**************************************************************************************************/
176 vector<int> PhyloTree::getGenusNodes()  {
177         try {
178                 genusIndex.clear();
179                 //generate genusIndexes
180                 map<int, int>::iterator it2;
181                 for (it2=uniqueTaxonomies.begin(); it2!=uniqueTaxonomies.end(); it2++) {  genusIndex.push_back(it2->first);     }
182                 
183                 return genusIndex;
184         }
185         catch(exception& e) {
186                 m->errorOut(e, "PhyloTree", "getGenusNodes");
187                 exit(1);
188         }
189 }
190 /**************************************************************************************************/
191 vector<int> PhyloTree::getGenusTotals() {
192         try {
193         
194                 if (calcTotals) {
195                         totals.clear();
196                         //reset counts because we are on a new word
197                         for (int j = 0; j < genusIndex.size(); j++) {
198                                 totals.push_back(tree[genusIndex[j]].accessions.size());
199                         }
200                         return totals;
201                 }else{
202                         return totals;
203                 }
204                 
205         }
206         catch(exception& e) {
207                 m->errorOut(e, "PhyloTree", "getGenusNodes");
208                 exit(1);
209         }
210 }
211 /**************************************************************************************************/
212
213 void PhyloTree::assignHeirarchyIDs(int index){
214         try {
215                 map<string,int>::iterator it;
216                 int counter = 1;
217                 
218                 for(it=tree[index].children.begin();it!=tree[index].children.end();it++){
219                         tree[it->second].heirarchyID = tree[index].heirarchyID + '.' + toString(counter);
220                         counter++;
221                         tree[it->second].level = tree[index].level + 1;
222                                                 
223                         //save maxLevel for binning the unclassified seqs
224                         if (tree[it->second].level > maxLevel) { maxLevel = tree[it->second].level; } 
225                         
226                         assignHeirarchyIDs(it->second);
227                 }
228         }
229         catch(exception& e) {
230                 m->errorOut(e, "PhyloTree", "assignHeirarchyIDs");
231                 exit(1);
232         }
233 }
234 /**************************************************************************************************/
235 void PhyloTree::binUnclassified(){
236         try {
237                 map<string, int>::iterator itBin;
238                 map<string, int>::iterator childPointer;
239                 
240                 //go through the seqs and if a sequence finest taxon is not the same level as the most finely defined taxon then classify it as unclassified where necessary
241                 for (itBin = name2Taxonomy.begin(); itBin != name2Taxonomy.end(); itBin++) {
242                 
243                         int level = tree[itBin->second].level;
244                         int currentNode = itBin->second;
245                         
246                         //this sequence is unclassified at some levels
247                         while(level != maxLevel){
248                         
249                                 level++;
250                         
251                                 string taxon = "unclassified";  
252                                 
253                                 //does the parent have a child names 'unclassified'?
254                                 childPointer = tree[currentNode].children.find(taxon);
255                                 
256                                 if(childPointer != tree[currentNode].children.end()){   //if the node already exists, move on
257                                         currentNode = childPointer->second; //currentNode becomes 'unclassified'
258                                         tree[currentNode].accessions.push_back(itBin->first);  //add this seq
259                                         name2Taxonomy[itBin->first] = currentNode;
260                                 }
261                                 else{                                                                                   //otherwise, create it
262                                         tree.push_back(TaxNode(taxon));
263                                         numNodes++;
264                                         tree[currentNode].children[taxon] = numNodes-1;
265                                         tree[numNodes-1].parent = currentNode;
266                                                                         
267                                         currentNode = tree[currentNode].children[taxon];
268                                         tree[currentNode].accessions.push_back(itBin->first);
269                                         name2Taxonomy[itBin->first] = currentNode;
270                                 }
271                                 
272                                 if (level == maxLevel) {   uniqueTaxonomies[currentNode] = currentNode; }
273                         }
274                 }
275                 
276                 //clear HeirarchyIDs and reset them
277                 for (int i = 1; i < tree.size(); i++) {
278                         tree[i].heirarchyID = "";
279                 }
280                 assignHeirarchyIDs(0);
281                 
282         }
283         catch(exception& e) {
284                 m->errorOut(e, "PhyloTree", "binUnclassified");
285                 exit(1);
286         }
287 }
288 /**************************************************************************************************/
289 string PhyloTree::getFullTaxonomy(string seqName) {
290         try {
291                 string tax = "";
292                 
293                 int currentNode = name2Taxonomy[seqName];
294                 
295                 while (tree[currentNode].parent != -1) {
296                         tax = tree[currentNode].name + ";" + tax;
297                         currentNode = tree[currentNode].parent;
298                 }
299                 
300                 return tax;
301         }
302         catch(exception& e) {
303                 m->errorOut(e, "PhyloTree", "getFullTaxonomy");
304                 exit(1);
305         }
306 }
307 /**************************************************************************************************/
308
309 void PhyloTree::print(ofstream& out){
310         try {
311                 out << tree[0].level << '\t'<< tree[0].heirarchyID << '\t' << tree[0].name << '\t' << tree[0].children.size() << '\t' << tree[0].accessions.size() << endl;
312                 print(0, out);
313         }
314         catch(exception& e) {
315                 m->errorOut(e, "PhyloTree", "print");
316                 exit(1);
317         }
318 }
319
320 /**************************************************************************************************/
321
322 void PhyloTree::print(int i, ofstream& out){
323         try {
324                 map<string,int>::iterator it;
325                 for(it=tree[i].children.begin();it!=tree[i].children.end();it++){
326                         out <<tree[it->second].level << '\t' << tree[it->second].heirarchyID << '\t' << tree[it->second].name << '\t' << tree[it->second].children.size() << '\t' << tree[it->second].accessions.size() << endl;
327                         print(it->second, out);
328                 }
329         }
330         catch(exception& e) {
331                 m->errorOut(e, "PhyloTree", "print");
332                 exit(1);
333         }
334 }
335 /**************************************************************************************************/
336 void PhyloTree::printTreeNodes(string treefilename) {
337         try {
338                 ofstream outTree;
339                 openOutputFile(treefilename, outTree);
340                 
341                 //print treenodes
342                 outTree << tree.size() << endl;
343                 for (int i = 0; i < tree.size(); i++) {
344                         outTree << tree[i].name << '\t' << tree[i].level << '\t' << tree[i].parent << endl;
345                 }
346                 
347                 //print genus nodes
348                 outTree << endl << uniqueTaxonomies.size() << endl;
349                 map<int, int>::iterator it2;
350                 for (it2=uniqueTaxonomies.begin(); it2!=uniqueTaxonomies.end(); it2++) {  outTree << it2->first << '\t' << tree[it2->first].accessions.size() << endl;  }
351                 outTree << endl;
352                 
353                 
354                 outTree.close();
355                 
356         }
357         catch(exception& e) {
358                 m->errorOut(e, "PhyloTree", "printTreeNodes");
359                 exit(1);
360         }
361 }
362 /**************************************************************************************************/
363
364
365