]> git.donarmstrong.com Git - mothur.git/blob - kmertree.cpp
fixes while testing 1.33.0
[mothur.git] / kmertree.cpp
1 //
2 //  kmerTree.cpp
3 //  pdsBayesian
4 //
5 //  Created by Patrick Schloss on 4/3/12.
6 //  Copyright (c) 2012 University of Michigan. All rights reserved.
7 //
8
9 #include "kmernode.h"
10 #include "kmertree.h"
11
12 /**************************************************************************************************/
13
14 KmerTree::KmerTree(string referenceFileName, string taxonomyFileName, int k, int cutoff) : Classify(), confidenceThreshold(cutoff), kmerSize(k){
15         try {
16         KmerNode* newNode = new KmerNode("Root", 0, kmerSize);
17         tree.push_back(newNode);                        //      the tree is stored as a vector of elements of type TaxonomyNode
18         
19         int power4s[14] = { 1, 4, 16, 64, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304, 16777216, 67108864 };
20         numPossibleKmers = power4s[kmerSize];
21         
22         string refTaxonomy;
23         
24         readTaxonomy(taxonomyFileName);
25         
26         ifstream referenceFile;
27         m->openInputFile(referenceFileName, referenceFile);
28         bool error = false;
29         while(!referenceFile.eof()){
30             
31             if (m->control_pressed) { break; }
32             
33             Sequence seq(referenceFile);  m->gobble(referenceFile);
34             
35             if (seq.getName() != "") {
36                 map<string, string>::iterator it = taxonomy.find(seq.getName());
37                 
38                 if (it != taxonomy.end()) {
39                     refTaxonomy = it->second;           //      lookup the taxonomy string for the current reference sequence
40                     vector<int> kmerProfile = ripKmerProfile(seq.getUnaligned());       //convert to kmer vector
41                     addTaxonomyToTree(seq.getName(), refTaxonomy, kmerProfile);
42                 }else {
43                     m->mothurOut(seq.getName() + " is in your reference file, but not in your taxonomy file, please correct.\n"); error = true;
44                 }
45             }
46         }
47         referenceFile.close();
48         
49         if (error) { m->control_pressed = true; }
50         
51         numTaxa = (int)tree.size();
52         numLevels = 0;
53         for(int i=0;i<numTaxa;i++){
54             int level = tree[i]->getLevel();
55             if(level > numLevels){      numLevels = level;      }
56         }
57         numLevels++;
58         
59         aggregateThetas();
60         
61         int dbSize = tree[0]->getNumSeqs();
62         
63         for(int i=0;i<numTaxa;i++){
64             tree[i]->checkTheta();
65             tree[i]->setNumUniqueKmers(tree[0]->getNumUniqueKmers());
66             tree[i]->setTotalSeqs(dbSize);
67         }
68     }
69         catch(exception& e) {
70                 m->errorOut(e, "KmerTree", "KmerTree");
71                 exit(1);
72         }
73 }
74
75 /**************************************************************************************************/
76
77 KmerTree::~KmerTree(){
78         
79         for(int i=0;i<tree.size();i++){
80                 delete tree[i];
81         }
82         
83 }       
84 /**********************************************************************************************************************/
85
86 vector<int> KmerTree::ripKmerProfile(string sequence){
87     try {
88         //      assume all input sequences are unaligned
89         
90         int power4s[14] = { 1, 4, 16, 64, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304, 16777216, 67108864 };
91         
92         int nKmers = (int)sequence.length() - kmerSize + 1;
93         
94         vector<int> kmerProfile(numPossibleKmers + 1, 0);
95         
96         for(int i=0;i<nKmers;i++){
97             
98             if (m->control_pressed) { break; }
99             
100             int kmer = 0;
101             for(int j=0;j<kmerSize;j++){
102                 if(toupper(sequence[j+i]) == 'A')               {       kmer += (0 * power4s[kmerSize-j-1]);    }
103                 else if(toupper(sequence[j+i]) == 'C')  {       kmer += (1 * power4s[kmerSize-j-1]);    }
104                 else if(toupper(sequence[j+i]) == 'G')  {       kmer += (2 * power4s[kmerSize-j-1]);    }
105                 else if(toupper(sequence[j+i]) == 'U')  {       kmer += (3 * power4s[kmerSize-j-1]);    }
106                 else if(toupper(sequence[j+i]) == 'T')  {       kmer += (3 * power4s[kmerSize-j-1]);    }
107                 else                                                                    {       kmer = power4s[kmerSize]; j = kmerSize; }
108             }
109             kmerProfile[kmer] = 1;
110         }
111         
112         return kmerProfile;     
113     }
114         catch(exception& e) {
115                 m->errorOut(e, "KmerTree", "ripKmerProfile");
116                 exit(1);
117         }
118 }
119
120 /**************************************************************************************************/
121
122 int KmerTree::addTaxonomyToTree(string seqName, string taxonomy, vector<int>& sequence){
123         try {
124         KmerNode* newNode;
125         string taxonName = "";
126         int treePosition = 0;                                                   //      the root is element 0
127         
128         
129         int level = 1;
130         
131         for(int i=0;i<taxonomy.length();i++){                   //      step through taxonomy string...
132             
133             if (m->control_pressed) { break; }
134             if(taxonomy[i] == ';'){                                             //      looking for semicolons...
135                 
136                 if (taxonName == "") {  m->mothurOut(seqName + " has an error in the taxonomy.  This may be due to a ;;"); m->mothurOutEndLine(); m->control_pressed = true; }
137                 
138                 int newIndex = tree[treePosition]->getChildIndex(taxonName);//  look to see if your current node already
139                 //         has a child with the new taxonName
140                 if(newIndex != -1)      {       treePosition = newIndex;        }               //      if you've seen it before, jump to that
141                 else {                                                                                                          //         position in the tree
142                     int newChildIndex = (int)tree.size();                                       //      otherwise, we'll have to create one...
143                     tree[treePosition]->makeChild(taxonName, newChildIndex);
144                     
145                     newNode = new KmerNode(taxonName, level, kmerSize);
146                     
147                     newNode->setParent(treePosition);
148                     
149                     tree.push_back(newNode);
150                     treePosition = newChildIndex;
151                 }
152                 
153                 //      sequence data to that node to update that node's theta - seems slow...                          
154                 taxonName = "";                                                         //      clear out the taxon name that we will build as we look 
155                 level++;
156                 
157             }                                                                                           //      for a semicolon
158             else{
159                 taxonName += taxonomy[i];                                       //      keep adding letters until we reach a semicolon
160             }
161         }
162         
163         tree[treePosition]->loadSequence(sequence);     //      now that we've gotten to the correct node, add the
164         
165         return 0;
166     }
167         catch(exception& e) {
168                 m->errorOut(e, "KmerTree", "addTaxonomyToTree");
169                 exit(1);
170         }
171         
172 }
173
174 /**************************************************************************************************/
175
176 int KmerTree::aggregateThetas(){
177         try {
178         vector<vector<int> > levelMatrix(numLevels+1);
179         
180         for(int i=0;i<tree.size();i++){
181             if (m->control_pressed) { return 0; }
182             levelMatrix[tree[i]->getLevel()].push_back(i);
183         }
184         
185         for(int i=numLevels-1;i>0;i--) {
186             if (m->control_pressed) { return 0; }
187             
188             for(int j=0;j<levelMatrix[i].size();j++){
189                 
190                 KmerNode* holder = tree[levelMatrix[i][j]];
191                 
192                 tree[holder->getParent()]->addThetas(holder->getTheta(), holder->getNumSeqs());                         
193             }
194         }
195         
196         return 0;
197         }
198         catch(exception& e) {
199                 m->errorOut(e, "KmerTree", "aggregateThetas");
200                 exit(1);
201         }
202 }
203
204 /**************************************************************************************************/
205
206 int KmerTree::getMinRiskIndexKmer(vector<int>& sequence, vector<int>& taxaIndices, vector<double>& probabilities){
207         try {
208         int numProbs = (int)probabilities.size();
209         
210         vector<double> G(numProbs, 0.2);        //a random sequence will, on average, be 20% similar to any other sequence; not sure that this holds up for kmers; whatever.
211         vector<double> risk(numProbs, 0);
212         
213         for(int i=1;i<numProbs;i++){ //use if you want the outlier group
214             if (m->control_pressed) { return 0; }
215             G[i] = tree[taxaIndices[i]]->getSimToConsensus(sequence);
216         }
217         
218         double minRisk = 1e6;
219         int minRiskIndex = 0;
220         
221         for(int i=0;i<numProbs;i++){
222             if (m->control_pressed) { return 0; }
223             for(int j=0;j<numProbs;j++){
224                 if(i != j){
225                     risk[i] += probabilities[j] * G[j];
226                 }                       
227             }
228             
229             if(risk[i] < minRisk){
230                 minRisk = risk[i];
231                 minRiskIndex = i;
232             }
233         }
234         
235         return minRiskIndex;
236     }
237         catch(exception& e) {
238                 m->errorOut(e, "KmerTree", "getMinRiskIndexKmer");
239                 exit(1);
240         }
241 }
242
243 /**************************************************************************************************/
244
245 int KmerTree::sanityCheck(vector<vector<int> >& indices, vector<int>& maxIndices){
246         try {
247         int finalLevel = (int)indices.size()-1;
248         
249         for(int position=1;position<indices.size();position++){
250             if (m->control_pressed) { return 0; }
251             int predictedParent = tree[indices[position][maxIndices[position]]]->getParent();
252             int actualParent = indices[position-1][maxIndices[position-1]];
253             
254             if(predictedParent != actualParent){
255                 finalLevel = position - 1;
256                 return finalLevel;
257             }
258         }
259         return finalLevel;
260         }
261         catch(exception& e) {
262                 m->errorOut(e, "KmerTree", "sanityCheck");
263                 exit(1);
264         }
265 }
266
267 /**************************************************************************************************/
268 string KmerTree::getTaxonomy(Sequence* thisSeq){
269         try {
270         string seqName = thisSeq->getName(); string querySequence = thisSeq->getAligned(); string taxonProbabilityString = "";
271         string unalignedSeq = thisSeq->getUnaligned();
272         
273         double logPOutlier = (querySequence.length() - kmerSize + 1) * log(1.0/(double)tree[0]->getNumUniqueKmers());
274         
275         vector<int> queryProfile = ripKmerProfile(unalignedSeq);        //convert to kmer vector
276         
277         vector<vector<double> > pXgivenKj_D_j(numLevels);
278         vector<vector<int> > indices(numLevels);
279         for(int i=0;i<numLevels;i++){
280             if (m->control_pressed) { return taxonProbabilityString; }
281             pXgivenKj_D_j[i].push_back(logPOutlier);
282             indices[i].push_back(-1);
283         }
284         
285         for(int i=0;i<numTaxa;i++){
286             if (m->control_pressed) { return taxonProbabilityString; }
287             pXgivenKj_D_j[tree[i]->getLevel()].push_back(tree[i]->getPxGivenkj_D_j(queryProfile));
288             indices[tree[i]->getLevel()].push_back(i);
289         }
290         
291         vector<double> sumLikelihood(numLevels, 0);
292         vector<double> bestPosterior(numLevels, 0);
293         vector<int> maxIndex(numLevels, 0);
294         int maxPosteriorIndex;
295         
296         //let's find the best level and taxa within that level
297         for(int i=0;i<numLevels;i++){ //go across all j's - from the root to genus
298             if (m->control_pressed) { return taxonProbabilityString; }
299             
300             int numTaxaInLevel = (int)indices[i].size();
301             
302             vector<double> posteriors(numTaxaInLevel, 0);               
303             sumLikelihood[i] = getLogExpSum(pXgivenKj_D_j[i], maxPosteriorIndex);
304             
305             maxPosteriorIndex = 0;
306             for(int j=0;j<numTaxaInLevel;j++){
307                 posteriors[j] = exp(pXgivenKj_D_j[i][j] - sumLikelihood[i]);
308                 if(posteriors[j] > posteriors[maxPosteriorIndex]){      
309                     maxPosteriorIndex = j;
310                 }
311                 
312             }
313             
314             maxIndex[i] = getMinRiskIndexKmer(queryProfile, indices[i], posteriors);
315             
316             maxIndex[i] = maxPosteriorIndex;
317             bestPosterior[i] = posteriors[maxIndex[i]]; 
318         }
319         
320         //      vector<double> pX_level(numLevels, 0);
321         //      
322         //      for(int i=0;i<numLevels;i++){
323         //              pX_level[i] = pXgivenKj_D_j[i][maxIndex[i]] - tree[indices[i][maxIndex[i]]]->getNumSeqs();
324         //      }
325         //      
326         //      int max_pLevel_X_index = -1;
327         //      double pX_level_sum = getLogExpSum(pX_level, max_pLevel_X_index);
328         //      double max_pLevel_X = exp(pX_level[max_pLevel_X_index] - pX_level_sum);
329         //      
330         //      vector<double> pLevel_X(numLevels, 0);
331         //      for(int i=0;i<numLevels;i++){
332         //              pLevel_X[i] = exp(pX_level[i] - pX_level_sum);
333         //      }
334         
335         int saneDepth = sanityCheck(indices, maxIndex);
336         
337         
338         //      stringstream levelProbabilityOutput;
339         //      levelProbabilityOutput.setf(ios::fixed, ios::floatfield);
340         //      levelProbabilityOutput.setf(ios::showpoint);
341         
342         
343         //taxonProbabilityOutput << seqName << '\t';
344         //      taxonProbabilityOutput << seqName << '(' << max_pLevel_X_index << ';' << max_pLevel_X << ')' << '\t';
345         //      levelProbabilityOutput << seqName << '(' << max_pLevel_X_index << ';' << max_pLevel_X << ')' << '\t';
346         simpleTax = "";
347         int savedspot = 1;
348         taxonProbabilityString = "";
349         for(int i=1;i<=saneDepth;i++){
350             if (m->control_pressed) { return taxonProbabilityString; }
351             int confidenceScore = (int) (bestPosterior[i] * 100);
352             if (confidenceScore >= confidenceThreshold) {
353                 if(indices[i][maxIndex[i]] != -1){
354                     taxonProbabilityString += tree[indices[i][maxIndex[i]]]->getName() + "(" + toString(confidenceScore) + ");";
355                     simpleTax += tree[indices[i][maxIndex[i]]]->getName() + ";";
356                     
357                     //                  levelProbabilityOutput << tree[indices[i][maxIndex[i]]]->getName() << '(' << setprecision(6) << pLevel_X[i] << ");";
358                 }
359                 else{
360                     taxonProbabilityString += "unclassified(" + toString(confidenceScore) + ");";
361                     //                  levelProbabilityOutput << "unclassified" << '(' << setprecision(6) << pLevel_X[i] << ");";
362                     simpleTax += "unclassified;";
363                 }
364             }else { break; }
365             savedspot = i;
366         }
367         
368         
369         
370         for(int i=savedspot+1;i<numLevels;i++){
371             if (m->control_pressed) { return taxonProbabilityString; }
372             taxonProbabilityString += "unclassified(0);";
373             simpleTax += "unclassified;";
374         }
375         
376         return taxonProbabilityString;
377         }
378         catch(exception& e) {
379                 m->errorOut(e, "KmerTree", "getTaxonomy");
380                 exit(1);
381         }
382 }
383
384
385 /**************************************************************************************************/
386