]> git.donarmstrong.com Git - mothur.git/blob - rawtrainingdatamaker.cpp
made classifier faster
[mothur.git] / rawtrainingdatamaker.cpp
1 /*
2  *  rawTrainingDataMaker.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 4/21/10.
6  *  Copyright 2010 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "rawtrainingdatamaker.h"
11
12 /**************************************************************************************************/
13
14 RawTrainingDataMaker::RawTrainingDataMaker(){
15         try {
16                 m = MothurOut::getInstance();
17                 numNodes = 1;
18                 numSeqs = 0;
19                 tree.push_back(rawTaxNode("Root"));
20                 tree[0].rank = "Root";
21                 maxLevel = 0;
22         }
23         catch(exception& e) {
24                 m->errorOut(e, "RawTrainingDataMaker", "RawTrainingDataMaker");
25                 exit(1);
26         }
27 }
28 /**************************************************************************************************/
29
30 RawTrainingDataMaker::RawTrainingDataMaker(string tfile){
31         try {
32                 m = MothurOut::getInstance();
33                 numNodes = 1;
34                 numSeqs = 0;
35                 tree.push_back(rawTaxNode("Root"));
36                 tree[0].rank = "Root";
37                 maxLevel = 0;
38                 
39                 ifstream in;
40                 openInputFile(tfile, in);
41                 
42                 //read in users taxonomy file and add sequences to tree
43                 string name, tax;
44                 while(!in.eof()){
45                         in >> name >> tax; gobble(in);
46                         
47                         addSeqToTree(name, tax);
48                 }
49                 in.close();
50         
51                 assignRank(0);
52         }
53         catch(exception& e) {
54                 m->errorOut(e, "RawTrainingDataMaker", "RawTrainingDataMaker");
55                 exit(1);
56         }
57 }
58
59 /**************************************************************************************************/
60
61 string RawTrainingDataMaker::getNextTaxon(string& heirarchy){
62         try {
63                 string currentLevel = "";
64                 if(heirarchy != ""){
65                         int pos = heirarchy.find_first_of(';');
66                         currentLevel=heirarchy.substr(0,pos);
67                         if (pos != (heirarchy.length()-1)) {  heirarchy=heirarchy.substr(pos+1);  }
68                         else { heirarchy = ""; }
69                 }
70                 return currentLevel;
71         }
72         catch(exception& e) {
73                 m->errorOut(e, "RawTrainingDataMaker", "getNextTaxon");
74                 exit(1);
75         }
76 }
77
78 /**************************************************************************************************/
79
80 int RawTrainingDataMaker::addSeqToTree(string seqName, string seqTaxonomy){
81         try {
82                 numSeqs++;
83                 
84                 map<string, int>::iterator childPointer;
85                 
86                 int currentNode = 0;
87                 string taxon;
88                 
89                 while(seqTaxonomy != ""){
90                         
91                         if (m->control_pressed) { return 0; }
92                         
93                         //somehow the parent is getting one too many accnos
94                         //use print to reassign the taxa id
95                         taxon = getNextTaxon(seqTaxonomy);
96                         
97                         childPointer = tree[currentNode].children.find(taxon);
98                         
99                         if(childPointer != tree[currentNode].children.end()){   //if the node already exists, move on
100                                 currentNode = childPointer->second;
101                         }else{                                                                                  //otherwise, create it
102                                 tree.push_back(rawTaxNode(taxon));
103                                 numNodes++;
104                                 tree[currentNode].children[taxon] = numNodes-1;
105                                 tree[numNodes-1].parent = currentNode;
106                                 
107                                 currentNode = tree[currentNode].children[taxon];
108                         }
109                 }
110
111         }
112         catch(exception& e) {
113                 m->errorOut(e, "RawTrainingDataMaker", "addSeqToTree");
114                 exit(1);
115         }
116 }
117 /**************************************************************************************************/
118
119 void RawTrainingDataMaker::assignRank(int index){
120         try {
121                 map<string,int>::iterator it;
122                                 
123                 string ranks[9] = { "Root","Domain","Kingdom","Phylum","Class","Order","Family","Genus","Species" };
124                 
125                 for(it=tree[index].children.begin();it!=tree[index].children.end();it++){
126                         tree[it->second].level = tree[index].level + 1;
127                         
128                         if (tree[it->second].level > 8) { 
129                                 tree[it->second].rank = ("unknown" + toString(tree[it->second].level));
130                         }else {
131                                 tree[it->second].rank = ranks[tree[it->second].level];
132                         }
133                                                 
134                         //save maxLevel for binning the unclassified seqs
135                         if (tree[it->second].level > maxLevel) { maxLevel = tree[it->second].level; } 
136                         
137                         assignRank(it->second);
138                 }
139         }
140         catch(exception& e) {
141                 m->errorOut(e, "RawTrainingDataMaker", "assignRank");
142                 exit(1);
143         }
144 }
145 /**************************************************************************************************/
146
147 void RawTrainingDataMaker::print(ofstream& out){
148         try {
149                 //string temp = tree[0].name +" " + tree[0].rank;
150                 //sanityCheck[temp] = temp;
151                 
152                 out << "0" << "*" << tree[0].name << "*" << tree[0].parent << "*" << tree[0].level << "*" << tree[0].rank << endl;
153                 print(0, out);
154                 
155         }
156         catch(exception& e) {
157                 m->errorOut(e, "RawTrainingDataMaker", "print");
158                 exit(1);
159         }
160 }
161
162 /**************************************************************************************************/
163
164 void RawTrainingDataMaker::print(int i, ofstream& out){
165         try {
166                 map<string,int>::iterator it;
167                 for(it=tree[i].children.begin();it!=tree[i].children.end();it++){
168                         //string temp = tree[it->second].name + " " + tree[it->second].rank;
169                         
170                         //map<string, string>::iterator itSan;
171                         //itSan = sanityCheck.find(temp);
172                         
173                         //if (itSan == sanityCheck.end()) {
174                                 out << it->second << "*" << tree[it->second].name << "*" << tree[it->second].parent << "*" << tree[it->second].level << "*" << tree[it->second].rank << endl;
175                                 //sanityCheck[temp] = temp;
176                         //}
177                         print(it->second, out);
178                 }
179         }
180         catch(exception& e) {
181                 m->errorOut(e, "RawTrainingDataMaker", "print");
182                 exit(1);
183         }
184 }
185 /**************************************************************************************************/
186
187
188