]> git.donarmstrong.com Git - mothur.git/blob - aligntree.cpp
changes while testing
[mothur.git] / aligntree.cpp
1 //
2 //  alignTree.cpp
3 //  pdsBayesian
4 //
5 //  Created by Patrick Schloss on 4/3/12.
6 //  Copyright (c) 2012 University of Michigan. All rights reserved.
7 //
8
9 #include "alignnode.h"
10 #include "aligntree.h"
11
12 /**************************************************************************************************/
13
14 AlignTree::AlignTree(string referenceFileName, string taxonomyFileName, int cutoff) : Classify(), confidenceThreshold(cutoff){
15         try {
16         AlignNode* newNode = new AlignNode("Root", 0);
17         tree.push_back(newNode);                        //      the tree is stored as a vector of elements of type TaxonomyNode
18         
19         string refTaxonomy;
20         
21         readTaxonomy(taxonomyFileName);
22      
23         ifstream referenceFile;
24         m->openInputFile(referenceFileName, referenceFile);
25         bool error = false;
26         map<int, int> lengths;
27         while(!referenceFile.eof()){
28             
29             if (m->control_pressed) { break; }
30             
31             Sequence seq(referenceFile);  m->gobble(referenceFile);
32             
33             if (seq.getName() != "") {
34                 map<string, string>::iterator it = taxonomy.find(seq.getName());
35                 
36                 if (it != taxonomy.end()) {
37                     refTaxonomy = it->second;           //      lookup the taxonomy string for the current reference sequence
38                     string aligned = seq.getAligned();
39                     lengths[aligned.length()] = 1;
40                     if (lengths.size() > 1) { error = true; m->mothurOut("[ERROR]: reference sequences must be aligned to use the align method, quitting.\n"); break; }
41                     addTaxonomyToTree(seq.getName(), refTaxonomy, aligned);
42                 }else {
43                     m->mothurOut(seq.getName() + " is in your reference file, but not in your taxonomy file, please correct.\n"); error = true;
44                 }
45             }
46         }
47         referenceFile.close();
48         
49         length = (lengths.begin())->first;  
50            
51         if (error) { m->control_pressed = true; }
52         
53         numTaxa = (int)tree.size();
54         
55         numLevels = 0;
56         for(int i=0;i<numTaxa;i++){
57             int level = tree[i]->getLevel();
58             if(level > numLevels){      numLevels = level;      }
59         }
60         numLevels++;
61         
62         aggregateThetas();
63         
64         int dbSize = tree[0]->getNumSeqs();
65         
66         for(int i=0;i<numTaxa;i++){
67             tree[i]->checkTheta();
68             tree[i]->setTotalSeqs(dbSize);
69         }
70         
71     }
72     catch(exception& e) {
73         m->errorOut(e, "AlignTree", "AlignTree");
74         exit(1);
75     }
76 }
77
78 /**************************************************************************************************/
79
80 AlignTree::~AlignTree(){
81         try {
82         for(int i=0;i<tree.size();i++){
83             delete tree[i];
84         }
85         }
86     catch(exception& e) {
87         m->errorOut(e, "AlignTree", "~AlignTree");
88         exit(1);
89     }
90 }       
91
92 /**************************************************************************************************/
93
94 int AlignTree::addTaxonomyToTree(string seqName, string& taxonomy, string& sequence){
95         try {
96         AlignNode* newNode;
97         string taxonName = "";
98         int treePosition = 0;                                                   //      the root is element 0
99         
100         int level = 1;
101         
102         for(int i=0;i<taxonomy.length();i++){                   //      step through taxonomy string...
103             
104             if (m->control_pressed) { break; }
105             
106             if(taxonomy[i] == ';'){                                             //      looking for semicolons...
107                 
108                 if (taxonName == "") {  m->mothurOut(seqName + " has an error in the taxonomy.  This may be due to a ;;"); m->mothurOutEndLine(); m->control_pressed = true; }
109                 
110                 int newIndex = tree[treePosition]->getChildIndex(taxonName);    //      look to see if your current node already
111                 //      has a child with the new taxonName
112                 if(newIndex != -1)      {       treePosition = newIndex;        }               //      if you've seen it before, jump to that
113                 else {                                                                                                          //       position in the tree
114                     int newChildIndex = (int)tree.size();                                               //      otherwise, we'll have to create one...
115                     tree[treePosition]->makeChild(taxonName, newChildIndex);
116                     
117                     newNode = new AlignNode(taxonName, level);
118                     
119                     newNode->setParent(treePosition);
120                     
121                     tree.push_back(newNode);
122                     treePosition = newChildIndex;
123                 }
124                 
125                 //      sequence data to that node to update that node's theta - seems slow...                          
126                 taxonName = "";                                                         //      clear out the taxon name that we will build as we look 
127                 level++;
128             }                                                                                           //      for a semicolon
129             else{
130                 taxonName += taxonomy[i];                                       //      keep adding letters until we reach a semicolon
131             }
132         }
133         tree[treePosition]->loadSequence(sequence);     //      now that we've gotten to the correct node, add the
134         
135         return 0;
136         }
137     catch(exception& e) {
138         m->errorOut(e, "AlignTree", "addTaxonomyToTree");
139         exit(1);
140     }
141 }
142
143 /**************************************************************************************************/
144
145 int AlignTree::aggregateThetas(){
146         try {
147         vector<vector<int> > levelMatrix(numLevels+1);
148         
149         for(int i=0;i<tree.size();i++){
150             if (m->control_pressed) { return 0; }
151             levelMatrix[tree[i]->getLevel()].push_back(i);
152         }
153                 
154         for(int i=numLevels-1;i>0;i--){
155             if (m->control_pressed) { return 0; }
156             for(int j=0;j<levelMatrix[i].size();j++){
157                 
158                 AlignNode* holder = tree[levelMatrix[i][j]];
159                 
160                 tree[holder->getParent()]->addThetas(holder->getTheta(), holder->getNumSeqs());                         
161             }
162         }
163             return 0;
164         }
165     catch(exception& e) {
166         m->errorOut(e, "AlignTree", "aggregateThetas");
167         exit(1);
168     }
169 }
170
171 /**************************************************************************************************/
172
173 double AlignTree::getOutlierLogProbability(string& sequence){
174         try {
175         double count = 0;
176         
177         for(int i=0;i<sequence.length();i++){
178             
179             if(sequence[i] != '.'){     count++;        }
180             
181         }
182         
183         return count * log(0.2);
184     }
185     catch(exception& e) {
186         m->errorOut(e, "AlignTree", "getOutlierLogProbability");
187         exit(1);
188     }
189 }
190
191 /**************************************************************************************************/
192
193 int AlignTree::getMinRiskIndexAlign(string& sequence, vector<int>& taxaIndices, vector<double>& probabilities){
194         try {
195         int numProbs = (int)probabilities.size();
196         
197         vector<double> G(numProbs, 0.2);        //a random sequence will, on average, be 20% similar to any other sequence
198         vector<double> risk(numProbs, 0);
199         
200         for(int i=1;i<numProbs;i++){ //use if you want the outlier group
201             if (m->control_pressed) { return 0; }
202             G[i] = tree[taxaIndices[i]]->getSimToConsensus(sequence);
203         }
204         
205         double minRisk = 1e6;
206         int minRiskIndex = 0;
207         
208         for(int i=0;i<numProbs;i++){
209             if (m->control_pressed) { return 0; }
210             for(int j=0;j<numProbs;j++){
211                 if(i != j){
212                     risk[i] += probabilities[j] * G[j];
213                 }                       
214             }
215             
216             if(risk[i] < minRisk){
217                 minRisk = risk[i];
218                 minRiskIndex = i;
219             }
220         }
221         
222         return minRiskIndex;
223     }
224     catch(exception& e) {
225         m->errorOut(e, "AlignTree", "getMinRiskIndexAlign");
226         exit(1);
227     }
228
229 }
230
231 /**************************************************************************************************/
232
233 int AlignTree::sanityCheck(vector<vector<int> >& indices, vector<int>& maxIndices){
234         try {
235         int finalLevel = (int)indices.size()-1;
236         
237         for(int position=1;position<indices.size();position++){
238             if (m->control_pressed) { return 0; }
239             int predictedParent = tree[indices[position][maxIndices[position]]]->getParent();
240             int actualParent = indices[position-1][maxIndices[position-1]];
241             
242             if(predictedParent != actualParent){
243                 finalLevel = position - 1;
244                 return finalLevel;
245             }
246         }
247         return finalLevel;
248         }
249     catch(exception& e) {
250         m->errorOut(e, "AlignTree", "sanityCheck");
251         exit(1);
252     }
253 }
254
255 /**************************************************************************************************/
256
257 string AlignTree::getTaxonomy(Sequence* seq){
258     try {
259         string seqName = seq->getName(); string querySequence = seq->getAligned(); string taxonProbabilityString = "";
260         if (querySequence.length() != length) {
261             m->mothurOut("[ERROR]: " + seq->getName() + " has length " + toString(querySequence.length()) + ", reference sequences length is " + toString(length) + ". Are your sequences aligned? Sequences must be aligned to use the align search method.\n"); m->control_pressed = true; return "";
262         }
263         double logPOutlier = getOutlierLogProbability(querySequence);
264         
265         vector<vector<double> > pXgivenKj_D_j(numLevels);
266         vector<vector<int> > indices(numLevels);
267         for(int i=0;i<numLevels;i++){
268             if (m->control_pressed) { return taxonProbabilityString; }
269             pXgivenKj_D_j[i].push_back(logPOutlier);
270             indices[i].push_back(-1);
271         }
272         
273         
274         for(int i=0;i<numTaxa;i++){
275             //          cout << i << '\t' << tree[i]->getName() << '\t' << tree[i]->getLevel() << '\t' << tree[i]->getPxGivenkj_D_j(querySequence) << endl;
276             if (m->control_pressed) { return taxonProbabilityString; }
277             pXgivenKj_D_j[tree[i]->getLevel()].push_back(tree[i]->getPxGivenkj_D_j(querySequence));
278             indices[tree[i]->getLevel()].push_back(i);
279         }
280         
281         vector<double> sumLikelihood(numLevels, 0);
282         vector<double> bestPosterior(numLevels, 0);
283         vector<int> maxIndex(numLevels, 0);
284         int maxPosteriorIndex;
285         
286         
287                 //cout << "before best level" << endl;
288         
289         //let's find the best level and taxa within that level
290         for(int i=0;i<numLevels;i++){ //go across all j's - from the root to genus
291             if (m->control_pressed) { return taxonProbabilityString; }
292             int numTaxaInLevel = (int)indices[i].size();
293             
294                         //cout << "numTaxaInLevel:\t" << numTaxaInLevel << endl;
295             
296             vector<double> posteriors(numTaxaInLevel, 0);               
297             sumLikelihood[i] = getLogExpSum(pXgivenKj_D_j[i], maxPosteriorIndex);
298             
299             maxPosteriorIndex = 0;
300             for(int j=0;j<numTaxaInLevel;j++){
301                 posteriors[j] = exp(pXgivenKj_D_j[i][j] - sumLikelihood[i]);
302                 
303                 if(posteriors[j] > posteriors[maxPosteriorIndex]){      
304                     maxPosteriorIndex = j;
305                 }
306                 
307             }
308             
309             maxIndex[i] = getMinRiskIndexAlign(querySequence, indices[i], posteriors);
310             
311             maxIndex[i] = maxPosteriorIndex;
312             bestPosterior[i] = posteriors[maxIndex[i]]; 
313         }
314         
315         //      vector<double> pX_level(numLevels, 0);
316         //      
317         //      for(int i=0;i<numLevels;i++){
318         //              pX_level[i] = pXgivenKj_D_j[i][maxIndex[i]] - tree[indices[i][maxIndex[i]]]->getNumSeqs();
319         //      }
320         //      
321         //      int max_pLevel_X_index = -1;
322         //      double pX_level_sum = getLogExpSum(pX_level, max_pLevel_X_index);
323         //      double max_pLevel_X = exp(pX_level[max_pLevel_X_index] - pX_level_sum);
324         //      
325         //      vector<double> pLevel_X(numLevels, 0);
326         //      for(int i=0;i<numLevels;i++){
327         //              pLevel_X[i] = exp(pX_level[i] - pX_level_sum);
328         //      }
329         
330         
331         
332         
333         int saneDepth = sanityCheck(indices, maxIndex);
334         
335         simpleTax = "";
336         int savedspot = 1;
337         taxonProbabilityString = "";
338         for(int i=1;i<=saneDepth;i++){
339             if (m->control_pressed) { return taxonProbabilityString; }
340             int confidenceScore = (int) (bestPosterior[i] * 100);
341             if (confidenceScore >= confidenceThreshold) {
342             if(indices[i][maxIndex[i]] != -1){
343                 taxonProbabilityString += tree[indices[i][maxIndex[i]]]->getName() + '(' + toString(confidenceScore) + ");";
344                 simpleTax += tree[indices[i][maxIndex[i]]]->getName() + ";";
345                 //                      levelProbabilityOutput << tree[indices[i][maxIndex[i]]]->getName() << '(' << setprecision(6) << pLevel_X[i] << ");";
346             }
347             else{
348                 taxonProbabilityString + "unclassified" + '(' + toString(confidenceScore) + ");";
349                 //                      levelProbabilityOutput << "unclassified" << '(' << setprecision(6) << pLevel_X[i] << ");";
350                 simpleTax += "unclassified;";
351             }
352             }else { break; }
353             savedspot = i;
354         }
355         
356         for(int i=savedspot+1;i<numLevels;i++){
357             if (m->control_pressed) { return taxonProbabilityString; }
358             taxonProbabilityString + "unclassified(0);";
359             simpleTax += "unclassified;";
360         }
361         
362         return taxonProbabilityString;
363         }
364     catch(exception& e) {
365         m->errorOut(e, "AlignTree", "getTaxonomy");
366         exit(1);
367     }
368 }
369
370
371 /**************************************************************************************************/