5 // Created by Patrick Schloss on 4/3/12.
6 // Copyright (c) 2012 University of Michigan. All rights reserved.
12 /**************************************************************************************************/
14 KmerTree::KmerTree(string referenceFileName, string taxonomyFileName, int k, int cutoff) : Classify(), confidenceThreshold(cutoff), kmerSize(k){
16 KmerNode* newNode = new KmerNode("Root", 0, kmerSize);
17 tree.push_back(newNode); // the tree is stored as a vector of elements of type TaxonomyNode
19 int power4s[14] = { 1, 4, 16, 64, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304, 16777216, 67108864 };
20 numPossibleKmers = power4s[kmerSize];
24 readTaxonomy(taxonomyFileName);
26 ifstream referenceFile;
27 m->openInputFile(referenceFileName, referenceFile);
29 while(!referenceFile.eof()){
31 if (m->control_pressed) { break; }
33 Sequence seq(referenceFile); m->gobble(referenceFile);
35 if (seq.getName() != "") {
36 map<string, string>::iterator it = taxonomy.find(seq.getName());
38 if (it != taxonomy.end()) {
39 refTaxonomy = it->second; // lookup the taxonomy string for the current reference sequence
40 vector<int> kmerProfile = ripKmerProfile(seq.getUnaligned()); //convert to kmer vector
41 addTaxonomyToTree(seq.getName(), refTaxonomy, kmerProfile);
43 m->mothurOut(seq.getName() + " is in your reference file, but not in your taxonomy file, please correct.\n"); error = true;
47 referenceFile.close();
49 if (error) { m->control_pressed = true; }
51 numTaxa = (int)tree.size();
53 for(int i=0;i<numTaxa;i++){
54 int level = tree[i]->getLevel();
55 if(level > numLevels){ numLevels = level; }
61 int dbSize = tree[0]->getNumSeqs();
63 for(int i=0;i<numTaxa;i++){
64 tree[i]->checkTheta();
65 tree[i]->setNumUniqueKmers(tree[0]->getNumUniqueKmers());
66 tree[i]->setTotalSeqs(dbSize);
70 m->errorOut(e, "KmerTree", "KmerTree");
75 /**************************************************************************************************/
77 KmerTree::~KmerTree(){
79 for(int i=0;i<tree.size();i++){
84 /**********************************************************************************************************************/
86 vector<int> KmerTree::ripKmerProfile(string sequence){
88 // assume all input sequences are unaligned
90 int power4s[14] = { 1, 4, 16, 64, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304, 16777216, 67108864 };
92 int nKmers = (int)sequence.length() - kmerSize + 1;
94 vector<int> kmerProfile(numPossibleKmers + 1, 0);
96 for(int i=0;i<nKmers;i++){
98 if (m->control_pressed) { break; }
101 for(int j=0;j<kmerSize;j++){
102 if(toupper(sequence[j+i]) == 'A') { kmer += (0 * power4s[kmerSize-j-1]); }
103 else if(toupper(sequence[j+i]) == 'C') { kmer += (1 * power4s[kmerSize-j-1]); }
104 else if(toupper(sequence[j+i]) == 'G') { kmer += (2 * power4s[kmerSize-j-1]); }
105 else if(toupper(sequence[j+i]) == 'U') { kmer += (3 * power4s[kmerSize-j-1]); }
106 else if(toupper(sequence[j+i]) == 'T') { kmer += (3 * power4s[kmerSize-j-1]); }
107 else { kmer = power4s[kmerSize]; j = kmerSize; }
109 kmerProfile[kmer] = 1;
114 catch(exception& e) {
115 m->errorOut(e, "KmerTree", "ripKmerProfile");
120 /**************************************************************************************************/
122 int KmerTree::addTaxonomyToTree(string seqName, string taxonomy, vector<int>& sequence){
125 string taxonName = "";
126 int treePosition = 0; // the root is element 0
131 for(int i=0;i<taxonomy.length();i++){ // step through taxonomy string...
133 if (m->control_pressed) { break; }
134 if(taxonomy[i] == ';'){ // looking for semicolons...
136 if (taxonName == "") { m->mothurOut(seqName + " has an error in the taxonomy. This may be due to a ;;"); m->mothurOutEndLine(); m->control_pressed = true; }
138 int newIndex = tree[treePosition]->getChildIndex(taxonName);// look to see if your current node already
139 // has a child with the new taxonName
140 if(newIndex != -1) { treePosition = newIndex; } // if you've seen it before, jump to that
141 else { // position in the tree
142 int newChildIndex = (int)tree.size(); // otherwise, we'll have to create one...
143 tree[treePosition]->makeChild(taxonName, newChildIndex);
145 newNode = new KmerNode(taxonName, level, kmerSize);
147 newNode->setParent(treePosition);
149 tree.push_back(newNode);
150 treePosition = newChildIndex;
153 // sequence data to that node to update that node's theta - seems slow...
154 taxonName = ""; // clear out the taxon name that we will build as we look
159 taxonName += taxonomy[i]; // keep adding letters until we reach a semicolon
163 tree[treePosition]->loadSequence(sequence); // now that we've gotten to the correct node, add the
167 catch(exception& e) {
168 m->errorOut(e, "KmerTree", "addTaxonomyToTree");
174 /**************************************************************************************************/
176 int KmerTree::aggregateThetas(){
178 vector<vector<int> > levelMatrix(numLevels+1);
180 for(int i=0;i<tree.size();i++){
181 if (m->control_pressed) { return 0; }
182 levelMatrix[tree[i]->getLevel()].push_back(i);
185 for(int i=numLevels-1;i>0;i--) {
186 if (m->control_pressed) { return 0; }
188 for(int j=0;j<levelMatrix[i].size();j++){
190 KmerNode* holder = tree[levelMatrix[i][j]];
192 tree[holder->getParent()]->addThetas(holder->getTheta(), holder->getNumSeqs());
198 catch(exception& e) {
199 m->errorOut(e, "KmerTree", "aggregateThetas");
204 /**************************************************************************************************/
206 int KmerTree::getMinRiskIndexKmer(vector<int>& sequence, vector<int>& taxaIndices, vector<double>& probabilities){
208 int numProbs = (int)probabilities.size();
210 vector<double> G(numProbs, 0.2); //a random sequence will, on average, be 20% similar to any other sequence; not sure that this holds up for kmers; whatever.
211 vector<double> risk(numProbs, 0);
213 for(int i=1;i<numProbs;i++){ //use if you want the outlier group
214 if (m->control_pressed) { return 0; }
215 G[i] = tree[taxaIndices[i]]->getSimToConsensus(sequence);
218 double minRisk = 1e6;
219 int minRiskIndex = 0;
221 for(int i=0;i<numProbs;i++){
222 if (m->control_pressed) { return 0; }
223 for(int j=0;j<numProbs;j++){
225 risk[i] += probabilities[j] * G[j];
229 if(risk[i] < minRisk){
237 catch(exception& e) {
238 m->errorOut(e, "KmerTree", "getMinRiskIndexKmer");
243 /**************************************************************************************************/
245 int KmerTree::sanityCheck(vector<vector<int> >& indices, vector<int>& maxIndices){
247 int finalLevel = (int)indices.size()-1;
249 for(int position=1;position<indices.size();position++){
250 if (m->control_pressed) { return 0; }
251 int predictedParent = tree[indices[position][maxIndices[position]]]->getParent();
252 int actualParent = indices[position-1][maxIndices[position-1]];
254 if(predictedParent != actualParent){
255 finalLevel = position - 1;
261 catch(exception& e) {
262 m->errorOut(e, "KmerTree", "sanityCheck");
267 /**************************************************************************************************/
268 string KmerTree::getTaxonomy(Sequence* thisSeq){
270 string seqName = thisSeq->getName(); string querySequence = thisSeq->getAligned(); string taxonProbabilityString = "";
271 string unalignedSeq = thisSeq->getUnaligned();
273 double logPOutlier = (querySequence.length() - kmerSize + 1) * log(1.0/(double)tree[0]->getNumUniqueKmers());
275 vector<int> queryProfile = ripKmerProfile(unalignedSeq); //convert to kmer vector
277 vector<vector<double> > pXgivenKj_D_j(numLevels);
278 vector<vector<int> > indices(numLevels);
279 for(int i=0;i<numLevels;i++){
280 if (m->control_pressed) { return taxonProbabilityString; }
281 pXgivenKj_D_j[i].push_back(logPOutlier);
282 indices[i].push_back(-1);
285 for(int i=0;i<numTaxa;i++){
286 if (m->control_pressed) { return taxonProbabilityString; }
287 pXgivenKj_D_j[tree[i]->getLevel()].push_back(tree[i]->getPxGivenkj_D_j(queryProfile));
288 indices[tree[i]->getLevel()].push_back(i);
291 vector<double> sumLikelihood(numLevels, 0);
292 vector<double> bestPosterior(numLevels, 0);
293 vector<int> maxIndex(numLevels, 0);
294 int maxPosteriorIndex;
296 //let's find the best level and taxa within that level
297 for(int i=0;i<numLevels;i++){ //go across all j's - from the root to genus
298 if (m->control_pressed) { return taxonProbabilityString; }
300 int numTaxaInLevel = (int)indices[i].size();
302 vector<double> posteriors(numTaxaInLevel, 0);
303 sumLikelihood[i] = getLogExpSum(pXgivenKj_D_j[i], maxPosteriorIndex);
305 maxPosteriorIndex = 0;
306 for(int j=0;j<numTaxaInLevel;j++){
307 posteriors[j] = exp(pXgivenKj_D_j[i][j] - sumLikelihood[i]);
308 if(posteriors[j] > posteriors[maxPosteriorIndex]){
309 maxPosteriorIndex = j;
314 maxIndex[i] = getMinRiskIndexKmer(queryProfile, indices[i], posteriors);
316 maxIndex[i] = maxPosteriorIndex;
317 bestPosterior[i] = posteriors[maxIndex[i]];
320 // vector<double> pX_level(numLevels, 0);
322 // for(int i=0;i<numLevels;i++){
323 // pX_level[i] = pXgivenKj_D_j[i][maxIndex[i]] - tree[indices[i][maxIndex[i]]]->getNumSeqs();
326 // int max_pLevel_X_index = -1;
327 // double pX_level_sum = getLogExpSum(pX_level, max_pLevel_X_index);
328 // double max_pLevel_X = exp(pX_level[max_pLevel_X_index] - pX_level_sum);
330 // vector<double> pLevel_X(numLevels, 0);
331 // for(int i=0;i<numLevels;i++){
332 // pLevel_X[i] = exp(pX_level[i] - pX_level_sum);
335 int saneDepth = sanityCheck(indices, maxIndex);
338 // stringstream levelProbabilityOutput;
339 // levelProbabilityOutput.setf(ios::fixed, ios::floatfield);
340 // levelProbabilityOutput.setf(ios::showpoint);
343 //taxonProbabilityOutput << seqName << '\t';
344 // taxonProbabilityOutput << seqName << '(' << max_pLevel_X_index << ';' << max_pLevel_X << ')' << '\t';
345 // levelProbabilityOutput << seqName << '(' << max_pLevel_X_index << ';' << max_pLevel_X << ')' << '\t';
348 taxonProbabilityString = "";
349 for(int i=1;i<=saneDepth;i++){
350 if (m->control_pressed) { return taxonProbabilityString; }
351 int confidenceScore = (int) (bestPosterior[i] * 100);
352 if (confidenceScore >= confidenceThreshold) {
353 if(indices[i][maxIndex[i]] != -1){
354 taxonProbabilityString += tree[indices[i][maxIndex[i]]]->getName() + "(" + toString(confidenceScore) + ");";
355 simpleTax += tree[indices[i][maxIndex[i]]]->getName() + ";";
357 // levelProbabilityOutput << tree[indices[i][maxIndex[i]]]->getName() << '(' << setprecision(6) << pLevel_X[i] << ");";
360 taxonProbabilityString += "unclassified(" + toString(confidenceScore) + ");";
361 // levelProbabilityOutput << "unclassified" << '(' << setprecision(6) << pLevel_X[i] << ");";
362 simpleTax += "unclassified;";
370 for(int i=savedspot+1;i<numLevels;i++){
371 if (m->control_pressed) { return taxonProbabilityString; }
372 taxonProbabilityString += "unclassified(0);";
373 simpleTax += "unclassified;";
376 return taxonProbabilityString;
378 catch(exception& e) {
379 m->errorOut(e, "KmerTree", "getTaxonomy");
385 /**************************************************************************************************/