From: Sarah Westcott Date: Wed, 3 Oct 2012 18:51:10 +0000 (-0400) Subject: added zap method to classify.seqs and changed bayesian method name to wang. X-Git-Url: https://git.donarmstrong.com/?p=mothur.git;a=commitdiff_plain;h=e8e13c129ba8184ec5932a090773f353f3ae3406 added zap method to classify.seqs and changed bayesian method name to wang. --- diff --git a/Mothur.xcodeproj/project.pbxproj b/Mothur.xcodeproj/project.pbxproj index 9d6261b..b2895e8 100644 --- a/Mothur.xcodeproj/project.pbxproj +++ b/Mothur.xcodeproj/project.pbxproj @@ -19,6 +19,11 @@ A71CB160130B04A2001E7287 /* anosimcommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A71CB15E130B04A2001E7287 /* anosimcommand.cpp */; }; A71FE12C12EDF72400963CA7 /* mergegroupscommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A71FE12B12EDF72400963CA7 /* mergegroupscommand.cpp */; }; A721765713BB9F7D0014DAAE /* referencedb.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A721765613BB9F7D0014DAAE /* referencedb.cpp */; }; + A721AB6A161C570F009860A1 /* alignnode.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A721AB66161C570F009860A1 /* alignnode.cpp */; }; + A721AB6B161C570F009860A1 /* aligntree.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A721AB68161C570F009860A1 /* aligntree.cpp */; }; + A721AB71161C572A009860A1 /* kmernode.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A721AB6D161C572A009860A1 /* kmernode.cpp */; }; + A721AB72161C572A009860A1 /* kmertree.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A721AB6F161C572A009860A1 /* kmertree.cpp */; }; + A721AB77161C573B009860A1 /* taxonomynode.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A721AB73161C573B009860A1 /* taxonomynode.cpp */; }; A724D2B7153C8628000A826F /* makebiomcommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A724D2B6153C8628000A826F /* makebiomcommand.cpp */; }; A727864412E9E28C00F86ABA /* removerarecommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A727864312E9E28C00F86ABA /* removerarecommand.cpp */; }; A7386C231619CCE600651424 /* classifysharedcommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7386C211619CCE600651424 /* classifysharedcommand.cpp */; }; @@ -395,6 +400,16 @@ A71FE12B12EDF72400963CA7 /* mergegroupscommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = mergegroupscommand.cpp; sourceTree = ""; }; A721765513BB9F7D0014DAAE /* referencedb.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = referencedb.h; sourceTree = ""; }; A721765613BB9F7D0014DAAE /* referencedb.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = referencedb.cpp; sourceTree = ""; }; + A721AB66161C570F009860A1 /* alignnode.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = alignnode.cpp; sourceTree = ""; }; + A721AB67161C570F009860A1 /* alignnode.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = alignnode.h; sourceTree = ""; }; + A721AB68161C570F009860A1 /* aligntree.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = aligntree.cpp; sourceTree = ""; }; + A721AB69161C570F009860A1 /* aligntree.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = aligntree.h; sourceTree = ""; }; + A721AB6D161C572A009860A1 /* kmernode.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = kmernode.cpp; sourceTree = ""; }; + A721AB6E161C572A009860A1 /* kmernode.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = kmernode.h; sourceTree = ""; }; + A721AB6F161C572A009860A1 /* kmertree.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = kmertree.cpp; sourceTree = ""; }; + A721AB70161C572A009860A1 /* kmertree.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = kmertree.h; sourceTree = ""; }; + A721AB73161C573B009860A1 /* taxonomynode.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = taxonomynode.cpp; sourceTree = ""; }; + A721AB74161C573B009860A1 /* taxonomynode.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = taxonomynode.h; sourceTree = ""; }; A724D2B4153C8600000A826F /* makebiomcommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = makebiomcommand.h; sourceTree = ""; }; A724D2B6153C8628000A826F /* makebiomcommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = makebiomcommand.cpp; sourceTree = ""; }; A727864212E9E28C00F86ABA /* removerarecommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = removerarecommand.h; sourceTree = ""; }; @@ -1775,10 +1790,18 @@ A7E9BA4B12D3966900DA6239 /* classifier */ = { isa = PBXGroup; children = ( - A7E9B65A12D37EC300DA6239 /* bayesian.cpp */, + A721AB67161C570F009860A1 /* alignnode.h */, + A721AB66161C570F009860A1 /* alignnode.cpp */, + A721AB69161C570F009860A1 /* aligntree.h */, + A721AB68161C570F009860A1 /* aligntree.cpp */, A7E9B65B12D37EC300DA6239 /* bayesian.h */, + A7E9B65A12D37EC300DA6239 /* bayesian.cpp */, A7E9B68E12D37EC400DA6239 /* classify.cpp */, A7E9B68F12D37EC400DA6239 /* classify.h */, + A721AB6E161C572A009860A1 /* kmernode.h */, + A721AB6D161C572A009860A1 /* kmernode.cpp */, + A721AB70161C572A009860A1 /* kmertree.h */, + A721AB6F161C572A009860A1 /* kmertree.cpp */, A7E9B73812D37EC400DA6239 /* knn.h */, A7E9B73712D37EC400DA6239 /* knn.cpp */, A7E9B78D12D37EC400DA6239 /* phylosummary.cpp */, @@ -1787,6 +1810,8 @@ A7E9B79012D37EC400DA6239 /* phylotree.h */, A7E9B85D12D37EC400DA6239 /* taxonomyequalizer.cpp */, A7E9B85E12D37EC400DA6239 /* taxonomyequalizer.h */, + A721AB74161C573B009860A1 /* taxonomynode.h */, + A721AB73161C573B009860A1 /* taxonomynode.cpp */, ); name = classifier; sourceTree = ""; @@ -2250,6 +2275,11 @@ A7386C29161A110800651424 /* decisiontree.cpp in Sources */, A77E1938161B201E00DB1A2A /* randomforest.cpp in Sources */, A77E193B161B289600DB1A2A /* rftreenode.cpp in Sources */, + A721AB6A161C570F009860A1 /* alignnode.cpp in Sources */, + A721AB6B161C570F009860A1 /* aligntree.cpp in Sources */, + A721AB71161C572A009860A1 /* kmernode.cpp in Sources */, + A721AB72161C572A009860A1 /* kmertree.cpp in Sources */, + A721AB77161C573B009860A1 /* taxonomynode.cpp in Sources */, ); runOnlyForDeploymentPostprocessing = 0; }; diff --git a/alignnode.cpp b/alignnode.cpp new file mode 100755 index 0000000..ccd8fb0 --- /dev/null +++ b/alignnode.cpp @@ -0,0 +1,257 @@ +/* + * alignNode.cpp + * bayesian + * + * Created by Pat Schloss on 10/11/11. + * Copyright 2011 Patrick D. Schloss. All rights reserved. + * + */ + +#include "alignNode.h" +#include "taxonomynode.h" + +#include "bayesian.h" + +/**************************************************************************************************/ + +AlignNode::AlignNode(string n, int l): TaxonomyNode(n, l){ + + alignLength = 0; +} + +/**************************************************************************************************/ + +void AlignNode::printTheta(){ + try { + m->mothurOut("A:\t"); for(int i=0;imothurOut(toString(theta[i].A)+ '\t'); } m->mothurOutEndLine(); + m->mothurOut("T:\t"); for(int i=0;imothurOut(toString(theta[i].T)+ '\t'); } m->mothurOutEndLine(); + m->mothurOut("G:\t"); for(int i=0;imothurOut(toString(theta[i].G)+ '\t'); } m->mothurOutEndLine(); + m->mothurOut("C:\t"); for(int i=0;imothurOut(toString(theta[i].C)+ '\t'); } m->mothurOutEndLine(); + m->mothurOut("I:\t"); for(int i=0;imothurOut(toString(theta[i].gap)+ '\t'); } m->mothurOutEndLine(); + } + catch(exception& e) { + m->errorOut(e, "AlignNode", "printTheta"); + exit(1); + } +} + +/**************************************************************************************************/ + +int AlignNode::loadSequence(string& sequence){ + try { + alignLength = (int)sequence.length(); // this function runs through the alignment and increments the frequency + // of each base for a particular taxon. we are building the thetas + + if(theta.size() == 0){ + theta.resize(alignLength); + columnCounts.resize(alignLength, 0); + } + + for(int i=0;icontrol_pressed) { return 0; } + + char base = sequence[i]; + + if(base == 'A') { theta[i].A++; columnCounts[i]++; } // our thetas will be alignLength x 5 + else if(base == 'T'){ theta[i].T++; columnCounts[i]++; } // and we ignore any position that has + else if(base == 'G'){ theta[i].G++; columnCounts[i]++; } // an ambiguous base call + else if(base == 'C'){ theta[i].C++; columnCounts[i]++; } + else if(base == '-'){ theta[i].gap++; columnCounts[i]++; } + else if(base == 'U'){ theta[i].T++; columnCounts[i]++; } + } + + numSeqs++; + + return 0; + } + catch(exception& e) { + m->errorOut(e, "AlignNode", "loadSequence"); + exit(1); + } +} + +/**************************************************************************************************/ + +int AlignNode::checkTheta(){ + try { + for(int i=0;icontrol_pressed) { return 0; } + + if(theta[i].gap == columnCounts[i]){ + columnCounts[i] = 0; + } + // else{ + // int maxCount = theta[i].A; + // + // if(theta[i].T > maxCount) { maxCount = theta[i].T; } + // if(theta[i].G > maxCount) { maxCount = theta[i].T; } + // if(theta[i].C > maxCount) { maxCount = theta[i].T; } + // if(theta[i].gap > maxCount) { maxCount = theta[i].T; } + // + // if(maxCount < columnCounts[i] * 0.25){// || maxCount == columnCounts[i]){ //remove any column where the maximum frequency is <50% + // columnCounts[i] = 0; + // } + // } + + } + + return 0; + } + catch(exception& e) { + m->errorOut(e, "AlignNode", "checkTheta"); + exit(1); + } +} + +/**************************************************************************************************/ + +int AlignNode::addThetas(vector newTheta, int newNumSeqs){ + try { + if(alignLength == 0){ + alignLength = (int)newTheta.size(); + theta.resize(alignLength); + columnCounts.resize(alignLength); + } + + for(int i=0;icontrol_pressed) { return 0; } + + theta[i].A += newTheta[i].A; columnCounts[i] += newTheta[i].A; + theta[i].T += newTheta[i].T; columnCounts[i] += newTheta[i].T; + theta[i].G += newTheta[i].G; columnCounts[i] += newTheta[i].G; + theta[i].C += newTheta[i].C; columnCounts[i] += newTheta[i].C; + theta[i].gap += newTheta[i].gap; columnCounts[i] += newTheta[i].gap; + } + + numSeqs += newNumSeqs; + + return 0; + } + catch(exception& e) { + m->errorOut(e, "AlignNode", "addThetas"); + exit(1); + } +} + +/**************************************************************************************************/ + +double AlignNode::getSimToConsensus(string& query){ + try { + double similarity = 0; + + int length = 0; + + for(int i=0;icontrol_pressed) { return similarity; } + + char base = query[i]; + + if(base != '.' && base != 'N' && columnCounts[i] != 0){ + + double fraction = 0; + + if(base == 'A'){ + fraction = (int) theta[i].A / (double) columnCounts[i]; + similarity += fraction; + length++; + } + else if(base == 'T'){ + fraction = (int) theta[i].T / (double) columnCounts[i]; + similarity += fraction; + length++; + } + else if(base == 'G'){ + fraction = (int) theta[i].G / (double) columnCounts[i]; + similarity += fraction; + length++; + } + else if(base == 'C'){ + fraction = (int) theta[i].C / (double) columnCounts[i]; + similarity += fraction; + length++; + } + else if(base == '-'){ + fraction = (int) theta[i].gap / (double) columnCounts[i]; + similarity += fraction; + length++; + } + } + } + + if(length != 0){ + similarity /= double(length); + } + else { + similarity = 0; + } + + return similarity; + + } + catch(exception& e) { + m->errorOut(e, "AlignNode", "getSimToConsensus"); + exit(1); + } +} + +/**************************************************************************************************/ + +double AlignNode::getPxGivenkj_D_j(string& query){ //P(x | k_j, D, j) + try { + double PxGivenkj_D_j = 0; + + int count = 0; + double alpha = 1 / (double)totalSeqs; //flat prior + + + for(int s=0;scontrol_pressed) { return PxGivenkj_D_j; } + + char base = query[s]; + thetaAlign thetaS = theta[s]; + + if(base != '.' && base != 'N' && columnCounts[s] != 0){ + double Nkj_s = (double)columnCounts[s]; + double nkj_si = 0; + + + if(base == 'A') { nkj_si = (double)thetaS.A; } + else if(base == 'T'){ nkj_si = (double)thetaS.T; } + else if(base == 'G'){ nkj_si = (double)thetaS.G; } + else if(base == 'C'){ nkj_si = (double)thetaS.C; } + else if(base == '-'){ nkj_si = (double)thetaS.gap; } + else if(base == 'U'){ nkj_si = (double)thetaS.T; } + + // double alpha = pow(0.2, double(Nkj_s)) + 0.0001; //need to make 1e-4 a variable in future; this is the non-flat prior + + // if(columnCounts[s] != nkj_si){ //deal only with segregating sites... + double numerator = nkj_si + alpha; + double denomenator = Nkj_s + 5.0 * alpha; + + PxGivenkj_D_j += log(numerator) - log(denomenator); + count++; + // } + } + if(base != '.' && columnCounts[s] == 0 && thetaS.gap == 0){ + count = 0; + break; + } + + } + + if(count == 0){ PxGivenkj_D_j = -1e10; } + + return PxGivenkj_D_j; + } + catch(exception& e) { + m->errorOut(e, "AlignNode", "getPxGivenkj_D_j"); + exit(1); + } +} + +/**************************************************************************************************/ diff --git a/alignnode.h b/alignnode.h new file mode 100755 index 0000000..4aecca7 --- /dev/null +++ b/alignnode.h @@ -0,0 +1,49 @@ +#ifndef ALIGNNODE +#define ALIGNNODE + +/* + * alignNode.h + * bayesian + * + * Created by Pat Schloss on 10/11/11. + * Copyright 2011 Patrick D. Schloss. All rights reserved. + * + */ + +#include "taxonomynode.h" + +/**************************************************************************************************/ + +struct thetaAlign { + thetaAlign() : A(0), T(0), G(0), C(0), gap(0){} + unsigned int A; + unsigned int T; + unsigned int G; + unsigned int C; + unsigned int gap; +}; + +/**************************************************************************************************/ + +class AlignNode : public TaxonomyNode { + +public: + AlignNode(string, int); + int loadSequence(string&); + int checkTheta(); + void printTheta(); + double getPxGivenkj_D_j(string& query); //P(x | k_j, D, j) + double getSimToConsensus(string& query); + vector getTheta() { return theta; } + int addThetas(vector, int); + +private: + vector theta; + vector columnCounts; + int alignLength; +}; + +/**************************************************************************************************/ + +#endif + diff --git a/aligntree.cpp b/aligntree.cpp new file mode 100755 index 0000000..41667ca --- /dev/null +++ b/aligntree.cpp @@ -0,0 +1,371 @@ +// +// alignTree.cpp +// pdsBayesian +// +// Created by Patrick Schloss on 4/3/12. +// Copyright (c) 2012 University of Michigan. All rights reserved. +// + +#include "alignnode.h" +#include "aligntree.h" + +/**************************************************************************************************/ + +AlignTree::AlignTree(string referenceFileName, string taxonomyFileName, int cutoff) : Classify(), confidenceThreshold(cutoff){ + try { + AlignNode* newNode = new AlignNode("Root", 0); + tree.push_back(newNode); // the tree is stored as a vector of elements of type TaxonomyNode + + string refTaxonomy; + + readTaxonomy(taxonomyFileName); + + ifstream referenceFile; + m->openInputFile(referenceFileName, referenceFile); + bool error = false; + map lengths; + while(!referenceFile.eof()){ + + if (m->control_pressed) { break; } + + Sequence seq(referenceFile); m->gobble(referenceFile); + + if (seq.getName() != "") { + map::iterator it = taxonomy.find(seq.getName()); + + if (it != taxonomy.end()) { + refTaxonomy = it->second; // lookup the taxonomy string for the current reference sequence + string aligned = seq.getAligned(); + lengths[aligned.length()] = 1; + if (lengths.size() > 1) { error = true; m->mothurOut("[ERROR]: reference sequences must be aligned to use the align method, quitting.\n"); break; } + addTaxonomyToTree(seq.getName(), refTaxonomy, aligned); + }else { + m->mothurOut(seq.getName() + " is in your reference file, but not in your taxonomy file, please correct.\n"); error = true; + } + } + } + referenceFile.close(); + + length = (lengths.begin())->first; + + if (error) { m->control_pressed = true; } + + numTaxa = (int)tree.size(); + + numLevels = 0; + for(int i=0;igetLevel(); + if(level > numLevels){ numLevels = level; } + } + numLevels++; + + aggregateThetas(); + + int dbSize = tree[0]->getNumSeqs(); + + for(int i=0;icheckTheta(); + tree[i]->setTotalSeqs(dbSize); + } + + } + catch(exception& e) { + m->errorOut(e, "AlignTree", "AlignTree"); + exit(1); + } +} + +/**************************************************************************************************/ + +AlignTree::~AlignTree(){ + try { + for(int i=0;ierrorOut(e, "AlignTree", "~AlignTree"); + exit(1); + } +} + +/**************************************************************************************************/ + +int AlignTree::addTaxonomyToTree(string seqName, string& taxonomy, string& sequence){ + try { + AlignNode* newNode; + string taxonName = ""; + int treePosition = 0; // the root is element 0 + + int level = 1; + + for(int i=0;icontrol_pressed) { break; } + + if(taxonomy[i] == ';'){ // looking for semicolons... + + if (taxonName == "") { m->mothurOut(seqName + " has an error in the taxonomy. This may be due to a ;;"); m->mothurOutEndLine(); m->control_pressed = true; } + + int newIndex = tree[treePosition]->getChildIndex(taxonName); // look to see if your current node already + // has a child with the new taxonName + if(newIndex != -1) { treePosition = newIndex; } // if you've seen it before, jump to that + else { // position in the tree + int newChildIndex = (int)tree.size(); // otherwise, we'll have to create one... + tree[treePosition]->makeChild(taxonName, newChildIndex); + + newNode = new AlignNode(taxonName, level); + + newNode->setParent(treePosition); + + tree.push_back(newNode); + treePosition = newChildIndex; + } + + // sequence data to that node to update that node's theta - seems slow... + taxonName = ""; // clear out the taxon name that we will build as we look + level++; + } // for a semicolon + else{ + taxonName += taxonomy[i]; // keep adding letters until we reach a semicolon + } + } + tree[treePosition]->loadSequence(sequence); // now that we've gotten to the correct node, add the + + return 0; + } + catch(exception& e) { + m->errorOut(e, "AlignTree", "addTaxonomyToTree"); + exit(1); + } +} + +/**************************************************************************************************/ + +int AlignTree::aggregateThetas(){ + try { + vector > levelMatrix(numLevels+1); + + for(int i=0;icontrol_pressed) { return 0; } + levelMatrix[tree[i]->getLevel()].push_back(i); + } + + for(int i=numLevels-1;i>0;i--){ + if (m->control_pressed) { return 0; } + for(int j=0;jgetParent()]->addThetas(holder->getTheta(), holder->getNumSeqs()); + } + } + return 0; + } + catch(exception& e) { + m->errorOut(e, "AlignTree", "aggregateThetas"); + exit(1); + } +} + +/**************************************************************************************************/ + +double AlignTree::getOutlierLogProbability(string& sequence){ + try { + double count = 0; + + for(int i=0;ierrorOut(e, "AlignTree", "getOutlierLogProbability"); + exit(1); + } +} + +/**************************************************************************************************/ + +int AlignTree::getMinRiskIndexAlign(string& sequence, vector& taxaIndices, vector& probabilities){ + try { + int numProbs = (int)probabilities.size(); + + vector G(numProbs, 0.2); //a random sequence will, on average, be 20% similar to any other sequence + vector risk(numProbs, 0); + + for(int i=1;icontrol_pressed) { return 0; } + G[i] = tree[taxaIndices[i]]->getSimToConsensus(sequence); + } + + double minRisk = 1e6; + int minRiskIndex = 0; + + for(int i=0;icontrol_pressed) { return 0; } + for(int j=0;jerrorOut(e, "AlignTree", "getMinRiskIndexAlign"); + exit(1); + } + +} + +/**************************************************************************************************/ + +int AlignTree::sanityCheck(vector >& indices, vector& maxIndices){ + try { + int finalLevel = (int)indices.size()-1; + + for(int position=1;positioncontrol_pressed) { return 0; } + int predictedParent = tree[indices[position][maxIndices[position]]]->getParent(); + int actualParent = indices[position-1][maxIndices[position-1]]; + + if(predictedParent != actualParent){ + finalLevel = position - 1; + return finalLevel; + } + } + return finalLevel; + } + catch(exception& e) { + m->errorOut(e, "AlignTree", "sanityCheck"); + exit(1); + } +} + +/**************************************************************************************************/ + +string AlignTree::getTaxonomy(Sequence* seq){ + try { + string seqName = seq->getName(); string querySequence = seq->getAligned(); string taxonProbabilityString = ""; + if (querySequence.length() != length) { + m->mothurOut("[ERROR]: " + seq->getName() + " has length " + toString(querySequence.length()) + ", reference sequences length is " + toString(length) + ". Are your sequences aligned? Sequences must be aligned to use the align search method.\n"); m->control_pressed = true; return ""; + } + double logPOutlier = getOutlierLogProbability(querySequence); + + vector > pXgivenKj_D_j(numLevels); + vector > indices(numLevels); + for(int i=0;icontrol_pressed) { return taxonProbabilityString; } + pXgivenKj_D_j[i].push_back(logPOutlier); + indices[i].push_back(-1); + } + + + for(int i=0;igetName() << '\t' << tree[i]->getLevel() << '\t' << tree[i]->getPxGivenkj_D_j(querySequence) << endl; + if (m->control_pressed) { return taxonProbabilityString; } + pXgivenKj_D_j[tree[i]->getLevel()].push_back(tree[i]->getPxGivenkj_D_j(querySequence)); + indices[tree[i]->getLevel()].push_back(i); + } + + vector sumLikelihood(numLevels, 0); + vector bestPosterior(numLevels, 0); + vector maxIndex(numLevels, 0); + int maxPosteriorIndex; + + + //cout << "before best level" << endl; + + //let's find the best level and taxa within that level + for(int i=0;icontrol_pressed) { return taxonProbabilityString; } + int numTaxaInLevel = (int)indices[i].size(); + + //cout << "numTaxaInLevel:\t" << numTaxaInLevel << endl; + + vector posteriors(numTaxaInLevel, 0); + sumLikelihood[i] = getLogExpSum(pXgivenKj_D_j[i], maxPosteriorIndex); + + maxPosteriorIndex = 0; + for(int j=0;j posteriors[maxPosteriorIndex]){ + maxPosteriorIndex = j; + } + + } + + maxIndex[i] = getMinRiskIndexAlign(querySequence, indices[i], posteriors); + + maxIndex[i] = maxPosteriorIndex; + bestPosterior[i] = posteriors[maxIndex[i]]; + } + + // vector pX_level(numLevels, 0); + // + // for(int i=0;igetNumSeqs(); + // } + // + // int max_pLevel_X_index = -1; + // double pX_level_sum = getLogExpSum(pX_level, max_pLevel_X_index); + // double max_pLevel_X = exp(pX_level[max_pLevel_X_index] - pX_level_sum); + // + // vector pLevel_X(numLevels, 0); + // for(int i=0;icontrol_pressed) { return taxonProbabilityString; } + int confidenceScore = (int) (bestPosterior[i] * 100); + if (confidenceScore >= confidenceThreshold) { + if(indices[i][maxIndex[i]] != -1){ + taxonProbabilityString += tree[indices[i][maxIndex[i]]]->getName() + '(' + toString(confidenceScore) + ");"; + simpleTax += tree[indices[i][maxIndex[i]]]->getName() + ";"; + // levelProbabilityOutput << tree[indices[i][maxIndex[i]]]->getName() << '(' << setprecision(6) << pLevel_X[i] << ");"; + } + else{ + taxonProbabilityString + "unclassified" + '(' + toString(confidenceScore) + ");"; + // levelProbabilityOutput << "unclassified" << '(' << setprecision(6) << pLevel_X[i] << ");"; + simpleTax += "unclassified;"; + } + }else { break; } + savedspot = i; + } + + for(int i=savedspot+1;icontrol_pressed) { return taxonProbabilityString; } + taxonProbabilityString + "unclassified(0);"; + simpleTax += "unclassified;"; + } + + return taxonProbabilityString; + } + catch(exception& e) { + m->errorOut(e, "AlignTree", "getTaxonomy"); + exit(1); + } +} + + +/**************************************************************************************************/ diff --git a/aligntree.h b/aligntree.h new file mode 100755 index 0000000..51008ff --- /dev/null +++ b/aligntree.h @@ -0,0 +1,34 @@ +// +// alignTree.h +// pdsBayesian +// +// Created by Patrick Schloss on 4/3/12. +// Copyright (c) 2012 University of Michigan. All rights reserved. +// + +#ifndef pdsBayesian_alignTree_h +#define pdsBayesian_alignTree_h + +#include "classify.h" + +class AlignNode; + +class AlignTree : public Classify { + +public: + AlignTree(string, string, int); + ~AlignTree(); + string getTaxonomy(Sequence*); + +private: + int addTaxonomyToTree(string, string&, string&); + double getOutlierLogProbability(string&); + int getMinRiskIndexAlign(string&, vector&, vector&); + int aggregateThetas(); + int sanityCheck(vector >&, vector&); + + int numSeqs, confidenceThreshold, length; + vector tree; +}; + +#endif diff --git a/classify.cpp b/classify.cpp index 459c90f..8aa3cdb 100644 --- a/classify.cpp +++ b/classify.cpp @@ -354,3 +354,37 @@ vector Classify::parseTax(string tax) { } /**************************************************************************************************/ +double Classify::getLogExpSum(vector probabilities, int& maxIndex){ + try { + // http://jblevins.org/notes/log-sum-exp + + double maxProb = probabilities[0]; + maxIndex = 0; + + int numProbs = (int)probabilities.size(); + + for(int i=1;i= maxProb){ + maxProb = probabilities[i]; + maxIndex = i; + } + } + + double probSum = 0.0000; + + for(int i=0;ierrorOut(e, "Classify", "getLogExpSum"); + exit(1); + } +} + +/**************************************************************************************************/ + diff --git a/classify.h b/classify.h index 6582be4..7b4c102 100644 --- a/classify.h +++ b/classify.h @@ -17,10 +17,8 @@ #include "database.hpp" #include "phylotree.h" - class Sequence; - /**************************************************************************************************/ class Classify { @@ -37,7 +35,6 @@ public: protected: map taxonomy; //name maps to taxonomy - //map genusCount; //maps genus to count - in essence a list of how many seqs are in each taxonomy map::iterator itTax; map::iterator it; Database* database; @@ -45,11 +42,12 @@ protected: string taxFile, templateFile, simpleTax; vector names; - int threadID; + int threadID, numLevels, numTaxa; bool flip, flipped, shortcuts; int readTaxonomy(string); vector parseTax(string); + double getLogExpSum(vector, int&); MothurOut* m; }; diff --git a/classifyseqscommand.cpp b/classifyseqscommand.cpp index 2e55673..36bccc2 100644 --- a/classifyseqscommand.cpp +++ b/classifyseqscommand.cpp @@ -21,9 +21,9 @@ vector ClassifySeqsCommand::setParameters(){ CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "none", "none",false,false); parameters.push_back(pcount); CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "none", "none",false,false); parameters.push_back(pgroup); - CommandParameter psearch("search", "Multiple", "kmer-blast-suffix-distance", "kmer", "", "", "",false,false); parameters.push_back(psearch); + CommandParameter psearch("search", "Multiple", "kmer-blast-suffix-distance-align", "kmer", "", "", "",false,false); parameters.push_back(psearch); CommandParameter pksize("ksize", "Number", "", "8", "", "", "",false,false); parameters.push_back(pksize); - CommandParameter pmethod("method", "Multiple", "bayesian-knn", "bayesian", "", "", "",false,false); parameters.push_back(pmethod); + CommandParameter pmethod("method", "Multiple", "wang-knn-zap", "wang", "", "", "",false,false); parameters.push_back(pmethod); CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors); CommandParameter pmatch("match", "Number", "", "1.0", "", "", "",false,false); parameters.push_back(pmatch); CommandParameter pmismatch("mismatch", "Number", "", "-1.0", "", "", "",false,false); parameters.push_back(pmismatch); @@ -55,11 +55,11 @@ string ClassifySeqsCommand::getHelpString(){ helpString += "The classify.seqs command reads a fasta file containing sequences and creates a .taxonomy file and a .tax.summary file.\n"; helpString += "The classify.seqs command parameters are reference, fasta, name, group, count, search, ksize, method, taxonomy, processors, match, mismatch, gapopen, gapextend, numwanted and probs.\n"; helpString += "The reference, fasta and taxonomy parameters are required. You may enter multiple fasta files by separating their names with dashes. ie. fasta=abrecovery.fasta-amzon.fasta \n"; - helpString += "The search parameter allows you to specify the method to find most similar template. Your options are: suffix, kmer, blast and distance. The default is kmer.\n"; + helpString += "The search parameter allows you to specify the method to find most similar template. Your options are: suffix, kmer, blast, align and distance. The default is kmer.\n"; helpString += "The name parameter allows you add a names file with your fasta file, if you enter multiple fasta files, you must enter matching names files for them.\n"; helpString += "The group parameter allows you add a group file so you can have the summary totals broken up by group.\n"; helpString += "The count parameter allows you add a count file so you can have the summary totals broken up by group.\n"; - helpString += "The method parameter allows you to specify classification method to use. Your options are: bayesian and knn. The default is bayesian.\n"; + helpString += "The method parameter allows you to specify classification method to use. Your options are: wang, knn and zap. The default is wang.\n"; helpString += "The ksize parameter allows you to specify the kmer size for finding most similar template to candidate. The default is 8.\n"; helpString += "The processors parameter allows you to specify the number of processors to use. The default is 1.\n"; #ifdef USE_MPI @@ -72,8 +72,8 @@ string ClassifySeqsCommand::getHelpString(){ helpString += "The gapextend parameter allows you to specify the penalty for extending a gap in an alignment. The default is -1.0.\n"; helpString += "The numwanted parameter allows you to specify the number of sequence matches you want with the knn method. The default is 10.\n"; helpString += "The cutoff parameter allows you to specify a bootstrap confidence threshold for your taxonomy. The default is 0.\n"; - helpString += "The probs parameter shuts off the bootstrapping results for the bayesian method. The default is true, meaning you want the bootstrapping to be shown.\n"; - helpString += "The iters parameter allows you to specify how many iterations to do when calculating the bootstrap confidence score for your taxonomy with the bayesian method. The default is 100.\n"; + helpString += "The probs parameter shuts off the bootstrapping results for the wang and zap method. The default is true, meaning you want the bootstrapping to be shown.\n"; + helpString += "The iters parameter allows you to specify how many iterations to do when calculating the bootstrap confidence score for your taxonomy with the wang method. The default is 100.\n"; //helpString += "The flip parameter allows you shut off mothur's The default is T.\n"; helpString += "The classify.seqs command should be in the following format: \n"; helpString += "classify.seqs(reference=yourTemplateFile, fasta=yourFastaFile, method=yourClassificationMethod, search=yourSearchmethod, ksize=yourKmerSize, taxonomy=yourTaxonomyFile, processors=yourProcessors) \n"; @@ -492,9 +492,6 @@ ClassifySeqsCommand::ClassifySeqsCommand(string option) { //check for optional parameter and set defaults // ...at some point should added some additional type checking... string temp; - temp = validParameter.validFile(parameters, "ksize", false); if (temp == "not found"){ temp = "8"; } - m->mothurConvert(temp, kmerSize); - temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); } m->setProcessors(temp); m->mothurConvert(temp, processors); @@ -536,7 +533,13 @@ ClassifySeqsCommand::ClassifySeqsCommand(string option) { search = validParameter.validFile(parameters, "search", false); if (search == "not found"){ search = "kmer"; } - method = validParameter.validFile(parameters, "method", false); if (method == "not found"){ method = "bayesian"; } + method = validParameter.validFile(parameters, "method", false); if (method == "not found"){ method = "wang"; } + + temp = validParameter.validFile(parameters, "ksize", false); if (temp == "not found"){ + temp = "8"; + if (method == "zap") { temp = "7"; } + } + m->mothurConvert(temp, kmerSize); temp = validParameter.validFile(parameters, "match", false); if (temp == "not found"){ temp = "1.0"; } m->mothurConvert(temp, match); @@ -570,8 +573,13 @@ ClassifySeqsCommand::ClassifySeqsCommand(string option) { m->mothurConvert(temp, iters); - if ((method == "bayesian") && (search != "kmer")) { - m->mothurOut("The bayesian method requires the kmer search." + search + "will be disregarded." ); m->mothurOutEndLine(); + if ((method == "wang") && (search != "kmer")) { + m->mothurOut("The wang method requires the kmer search. " + search + " will be disregarded, and kmer will be used." ); m->mothurOutEndLine(); + search = "kmer"; + } + + if ((method == "zap") && ((search != "kmer") && (search != "align"))) { + m->mothurOut("The zap method requires the kmer or align search. " + search + " will be disregarded, and kmer will be used." ); m->mothurOutEndLine(); search = "kmer"; } @@ -605,10 +613,16 @@ int ClassifySeqsCommand::execute(){ try { if (abort == true) { if (calledHelp) { return 0; } return 2; } - if(method == "bayesian"){ classify = new Bayesian(taxonomyFileName, templateFileName, search, kmerSize, cutoff, iters, rand(), flip, writeShortcuts); } + string outputMethodTag = method + "."; + if(method == "wang"){ classify = new Bayesian(taxonomyFileName, templateFileName, search, kmerSize, cutoff, iters, rand(), flip, writeShortcuts); } else if(method == "knn"){ classify = new Knn(taxonomyFileName, templateFileName, search, kmerSize, gapOpen, gapExtend, match, misMatch, numWanted, rand()); } + else if(method == "zap"){ + outputMethodTag = search + "_" + outputMethodTag; + if (search == "kmer") { classify = new KmerTree(templateFileName, taxonomyFileName, kmerSize, cutoff); } + else { classify = new AlignTree(templateFileName, taxonomyFileName, cutoff); } + } else { - m->mothurOut(search + " is not a valid method option. I will run the command using bayesian."); + m->mothurOut(search + " is not a valid method option. I will run the command using wang."); m->mothurOutEndLine(); classify = new Bayesian(taxonomyFileName, templateFileName, search, kmerSize, cutoff, iters, rand(), flip, writeShortcuts); } @@ -633,10 +647,10 @@ int ClassifySeqsCommand::execute(){ if (RippedTaxName != "") { RippedTaxName += "."; } if (outputDir == "") { outputDir += m->hasPath(fastaFileNames[s]); } - string newTaxonomyFile = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])) + RippedTaxName + getOutputFileNameTag("taxonomy"); - string newaccnosFile = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])) + RippedTaxName + getOutputFileNameTag("accnos"); + string newTaxonomyFile = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])) + RippedTaxName + outputMethodTag + getOutputFileNameTag("taxonomy"); + string newaccnosFile = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])) + RippedTaxName + outputMethodTag +getOutputFileNameTag("accnos"); string tempTaxonomyFile = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])) + "taxonomy.temp"; - string taxSummary = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])) + RippedTaxName + getOutputFileNameTag("taxsummary"); + string taxSummary = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])) + RippedTaxName + outputMethodTag + getOutputFileNameTag("taxsummary"); if ((method == "knn") && (search == "distance")) { string DistName = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])) + getOutputFileNameTag("matchdist"); @@ -798,12 +812,12 @@ int ClassifySeqsCommand::execute(){ if (hasCount) { ct = new CountTable(); ct->readTable(countfileNames[s]); - taxaSum = new PhyloSummary(baseTName, ct); + taxaSum = new PhyloSummary(taxonomyFileName, ct); taxaSum->summarize(tempTaxonomyFile); }else { if (groupfile != "") { group = groupfileNames[s]; groupMap = new GroupMap(group); groupMap->readMap(); } - taxaSum = new PhyloSummary(baseTName, groupMap); + taxaSum = new PhyloSummary(taxonomyFileName, groupMap); if (m->control_pressed) { outputTypes.clear(); if (ct != NULL) { delete ct; } if (groupMap != NULL) { delete groupMap; } delete taxaSum; for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } delete classify; return 0; } diff --git a/classifyseqscommand.h b/classifyseqscommand.h index 6d11d92..0cffec6 100644 --- a/classifyseqscommand.h +++ b/classifyseqscommand.h @@ -19,9 +19,11 @@ #include "phylotree.h" #include "phylosummary.h" #include "knn.h" +#include "kmertree.h" +#include "aligntree.h" -//KNN and Bayesian methods modeled from algorithms in +//KNN and Wang methods modeled from algorithms in //Naı¨ve Bayesian Classifier for Rapid Assignment of rRNA Sequences //into the New Bacterial Taxonomy􏰎† //Qiong Wang,1 George M. Garrity,1,2 James M. Tiedje,1,2 and James R. Cole1* @@ -166,6 +168,11 @@ static DWORD WINAPI MyClassThreadFunction(LPVOID lpParam){ Classify* myclassify; if(pDataArray->method == "bayesian"){ myclassify = new Bayesian(pDataArray->taxonomyFileName, pDataArray->templateFileName, pDataArray->search, pDataArray->kmerSize, pDataArray->cutoff, pDataArray->iters, pDataArray->threadID, pDataArray->flip, pDataArray->writeShortcuts); } else if(pDataArray->method == "knn"){ myclassify = new Knn(pDataArray->taxonomyFileName, pDataArray->templateFileName, pDataArray->search, pDataArray->kmerSize, pDataArray->gapOpen, pDataArray->gapExtend, pDataArray->match, pDataArray->misMatch, pDataArray->numWanted, pDataArray->threadID); } + else if(pDataArray->method == "zap"){ + outputMethodTag = search + "_" + outputMethodTag; + if (pDataArray->search == "kmer") { myclassify = new KmerTree(pDataArray->templateFileName, pDataArray->taxonomyFileName, pDataArray->kmerSize, pDataArray->cutoff); } + else { myclassify = new AlignTree(pDataArray->templateFileName, pDataArray->taxonomyFileName, pDataArray->cutoff); } + } else { pDataArray->m->mothurOut(pDataArray->search + " is not a valid method option. I will run the command using bayesian."); pDataArray->m->mothurOutEndLine(); diff --git a/kmernode.cpp b/kmernode.cpp new file mode 100755 index 0000000..c087cac --- /dev/null +++ b/kmernode.cpp @@ -0,0 +1,209 @@ +/* + * kmerNode.cpp + * bayesian + * + * Created by Pat Schloss on 10/11/11. + * Copyright 2011 Patrick D. Schloss. All rights reserved. + * + */ + +#include "kmernode.h" + + +/**********************************************************************************************************************/ + +KmerNode::KmerNode(string s, int l, int n) : TaxonomyNode(s, l), kmerSize(n) { + try { + int power4s[14] = { 1, 4, 16, 64, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304, 16777216, 67108864 }; + + numPossibleKmers = power4s[kmerSize]; + numUniqueKmers = 0; + + kmerVector.assign(numPossibleKmers, 0); + } + catch(exception& e) { + m->errorOut(e, "KmerNode", "KmerNode"); + exit(1); + } +} + +/**********************************************************************************************************************/ + +void KmerNode::loadSequence(vector& kmerProfile){ + try { + for(int i=0;icontrol_pressed) { break; } + if(kmerVector[i] == 0 && kmerProfile[i] != 0) { numUniqueKmers++; } + + kmerVector[i] += kmerProfile[i]; + } + + numSeqs++; + } + catch(exception& e) { + m->errorOut(e, "KmerNode", "loadSequence"); + exit(1); + } +} + +/**********************************************************************************************************************/ + +string KmerNode::getKmerBases(int kmerNumber){ + try { + // Here we convert the kmer number into the kmer in terms of bases. + // + // Example: Score = 915 (for a 6-mer) + // Base6 = (915 / 4^0) % 4 = 915 % 4 = 3 => T [T] + // Base5 = (915 / 4^1) % 4 = 228 % 4 = 0 => A [AT] + // Base4 = (915 / 4^2) % 4 = 57 % 4 = 1 => C [CAT] + // Base3 = (915 / 4^3) % 4 = 14 % 4 = 2 => G [GCAT] + // Base2 = (915 / 4^4) % 4 = 3 % 4 = 3 => T [TGCAT] + // Base1 = (915 / 4^5) % 4 = 0 % 4 = 0 => A [ATGCAT] -> this checks out with the previous method + + int power4s[14] = { 1, 4, 16, 64, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304, 16777216, 67108864 }; + + string kmer = ""; + + if(kmerNumber == power4s[kmerSize]){//pow(4.,7)){ // if the kmer number is the same as the maxKmer then it must + for(int i=0;icontrol_pressed) { return kmer; } + int nt = (int)(kmerNumber / (float)power4s[i]) % 4; // the '%' operator returns the remainder + if(nt == 0) { kmer = 'A' + kmer; } // from int-based division ] + else if(nt == 1){ kmer = 'C' + kmer; } + else if(nt == 2){ kmer = 'G' + kmer; } + else if(nt == 3){ kmer = 'T' + kmer; } + } + } + return kmer; + } + catch(exception& e) { + m->errorOut(e, "KmerNode", "getKmerBases"); + exit(1); + } +} + +/**************************************************************************************************/ + +void KmerNode::addThetas(vector newTheta, int newNumSeqs){ + try { + for(int i=0;icontrol_pressed) { break; } + kmerVector[i] += newTheta[i]; + } + + // if(alignLength == 0){ + // alignLength = (int)newTheta.size(); + // theta.resize(alignLength); + // columnCounts.resize(alignLength); + // } + // + // for(int i=0;ierrorOut(e, "KmerNode", "addThetas"); + exit(1); + } +} + +/**********************************************************************************************************************/ + +int KmerNode::getNumUniqueKmers(){ + try { + if(numUniqueKmers == 0){ + + for(int i=0;icontrol_pressed) { return numUniqueKmers; } + if(kmerVector[i] != 0){ + numUniqueKmers++; + } + + } + + } + + return numUniqueKmers; + + } + catch(exception& e) { + m->errorOut(e, "KmerNode", "getNumUniqueKmers"); + exit(1); + } +} + +/**********************************************************************************************************************/ + +void KmerNode::printTheta(){ + try { + m->mothurOut(name + "\n"); + for(int i=0;imothurOut(getKmerBases(i) + '\t' + toString(kmerVector[i]) + "\n"); + } + } + m->mothurOutEndLine(); + } + catch(exception& e) { + m->errorOut(e, "KmerNode", "printTheta"); + exit(1); + } + +} +/**************************************************************************************************/ + +double KmerNode::getSimToConsensus(vector& queryKmerProfile){ + try { + double present = 0; + + for(int i=0;icontrol_pressed) { return present; } + if(queryKmerProfile[i] != 0 && kmerVector[i] != 0){ + present++; + } + } + + return present / double(queryKmerProfile.size() - kmerSize + 1); + } + catch(exception& e) { + m->errorOut(e, "KmerNode", "getSimToConsensus"); + exit(1); + } +} + +/**********************************************************************************************************************/ + +double KmerNode::getPxGivenkj_D_j(vector& queryKmerProfile) { + try { + double sumLogProb = 0.0000; + double alpha = 1.0 / (double)totalSeqs; //flat prior + // double alpha = pow((1.0 / (double)numUniqueKmers), numSeqs)+0.0001; //non-flat prior + + for(int i=0;icontrol_pressed) { return sumLogProb; } + if(queryKmerProfile[i] != 0){ //numUniqueKmers needs to be the value from Root; + sumLogProb += log((kmerVector[i] + alpha) / (numSeqs + numUniqueKmers * alpha)); + } + + } + return sumLogProb; + } + catch(exception& e) { + m->errorOut(e, "KmerNode", "getPxGivenkj_D_j"); + exit(1); + } + +} + +/**********************************************************************************************************************/ diff --git a/kmernode.h b/kmernode.h new file mode 100755 index 0000000..e15fb1d --- /dev/null +++ b/kmernode.h @@ -0,0 +1,45 @@ +#ifndef KMERNODE +#define KMERNODE + +/* + * kmerNode.h + * bayesian + * + * Created by Pat Schloss on 10/11/11. + * Copyright 2011 Patrick D. Schloss. All rights reserved. + * + */ + + +#include "taxonomynode.h" + +/**********************************************************************************************************************/ + +class KmerNode : public TaxonomyNode { + +public: + KmerNode(string, int, int); + void loadSequence(vector&); + void printTheta(); + double getPxGivenkj_D_j(vector&); + double getSimToConsensus(vector&); + void checkTheta(){}; + void setNumUniqueKmers(int num) { numUniqueKmers = num; } + int getNumUniqueKmers(); + void addThetas(vector, int); + vector getTheta() { return kmerVector; } + + +private: + string getKmerBases(int); + int kmerSize; // value of k + int numPossibleKmers; // 4^kmerSize + int numUniqueKmers; // number of unique kmers seen in a group ~ O_kj + int numKmers; // number of kmers in a sequence + vector kmerVector; // counts of kmers across all sequences in a node +}; + +/**********************************************************************************************************************/ + +#endif + diff --git a/kmertree.cpp b/kmertree.cpp new file mode 100755 index 0000000..fbf2bfb --- /dev/null +++ b/kmertree.cpp @@ -0,0 +1,386 @@ +// +// kmerTree.cpp +// pdsBayesian +// +// Created by Patrick Schloss on 4/3/12. +// Copyright (c) 2012 University of Michigan. All rights reserved. +// + +#include "kmernode.h" +#include "kmertree.h" + +/**************************************************************************************************/ + +KmerTree::KmerTree(string referenceFileName, string taxonomyFileName, int k, int cutoff) : Classify(), confidenceThreshold(cutoff), kmerSize(k){ + try { + KmerNode* newNode = new KmerNode("Root", 0, kmerSize); + tree.push_back(newNode); // the tree is stored as a vector of elements of type TaxonomyNode + + int power4s[14] = { 1, 4, 16, 64, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304, 16777216, 67108864 }; + numPossibleKmers = power4s[kmerSize]; + + string refTaxonomy; + + readTaxonomy(taxonomyFileName); + + ifstream referenceFile; + m->openInputFile(referenceFileName, referenceFile); + bool error = false; + while(!referenceFile.eof()){ + + if (m->control_pressed) { break; } + + Sequence seq(referenceFile); m->gobble(referenceFile); + + if (seq.getName() != "") { + map::iterator it = taxonomy.find(seq.getName()); + + if (it != taxonomy.end()) { + refTaxonomy = it->second; // lookup the taxonomy string for the current reference sequence + vector kmerProfile = ripKmerProfile(seq.getUnaligned()); //convert to kmer vector + addTaxonomyToTree(seq.getName(), refTaxonomy, kmerProfile); + }else { + m->mothurOut(seq.getName() + " is in your reference file, but not in your taxonomy file, please correct.\n"); error = true; + } + } + } + referenceFile.close(); + + if (error) { m->control_pressed = true; } + + numTaxa = (int)tree.size(); + numLevels = 0; + for(int i=0;igetLevel(); + if(level > numLevels){ numLevels = level; } + } + numLevels++; + + aggregateThetas(); + + int dbSize = tree[0]->getNumSeqs(); + + for(int i=0;icheckTheta(); + tree[i]->setNumUniqueKmers(tree[0]->getNumUniqueKmers()); + tree[i]->setTotalSeqs(dbSize); + } + } + catch(exception& e) { + m->errorOut(e, "KmerTree", "KmerTree"); + exit(1); + } +} + +/**************************************************************************************************/ + +KmerTree::~KmerTree(){ + + for(int i=0;i KmerTree::ripKmerProfile(string sequence){ + try { + // assume all input sequences are unaligned + + int power4s[14] = { 1, 4, 16, 64, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304, 16777216, 67108864 }; + + int nKmers = (int)sequence.length() - kmerSize + 1; + + vector kmerProfile(numPossibleKmers + 1, 0); + + for(int i=0;icontrol_pressed) { break; } + + int kmer = 0; + for(int j=0;jerrorOut(e, "KmerTree", "ripKmerProfile"); + exit(1); + } +} + +/**************************************************************************************************/ + +int KmerTree::addTaxonomyToTree(string seqName, string taxonomy, vector& sequence){ + try { + KmerNode* newNode; + string taxonName = ""; + int treePosition = 0; // the root is element 0 + + + int level = 1; + + for(int i=0;icontrol_pressed) { break; } + if(taxonomy[i] == ';'){ // looking for semicolons... + + if (taxonName == "") { m->mothurOut(seqName + " has an error in the taxonomy. This may be due to a ;;"); m->mothurOutEndLine(); m->control_pressed = true; } + + int newIndex = tree[treePosition]->getChildIndex(taxonName);// look to see if your current node already + // has a child with the new taxonName + if(newIndex != -1) { treePosition = newIndex; } // if you've seen it before, jump to that + else { // position in the tree + int newChildIndex = (int)tree.size(); // otherwise, we'll have to create one... + tree[treePosition]->makeChild(taxonName, newChildIndex); + + newNode = new KmerNode(taxonName, level, kmerSize); + + newNode->setParent(treePosition); + + tree.push_back(newNode); + treePosition = newChildIndex; + } + + // sequence data to that node to update that node's theta - seems slow... + taxonName = ""; // clear out the taxon name that we will build as we look + level++; + + } // for a semicolon + else{ + taxonName += taxonomy[i]; // keep adding letters until we reach a semicolon + } + } + + tree[treePosition]->loadSequence(sequence); // now that we've gotten to the correct node, add the + + return 0; + } + catch(exception& e) { + m->errorOut(e, "KmerTree", "addTaxonomyToTree"); + exit(1); + } + +} + +/**************************************************************************************************/ + +int KmerTree::aggregateThetas(){ + try { + vector > levelMatrix(numLevels+1); + + for(int i=0;icontrol_pressed) { return 0; } + levelMatrix[tree[i]->getLevel()].push_back(i); + } + + for(int i=numLevels-1;i>0;i--) { + if (m->control_pressed) { return 0; } + + for(int j=0;jgetParent()]->addThetas(holder->getTheta(), holder->getNumSeqs()); + } + } + + return 0; + } + catch(exception& e) { + m->errorOut(e, "KmerTree", "aggregateThetas"); + exit(1); + } +} + +/**************************************************************************************************/ + +int KmerTree::getMinRiskIndexKmer(vector& sequence, vector& taxaIndices, vector& probabilities){ + try { + int numProbs = (int)probabilities.size(); + + vector G(numProbs, 0.2); //a random sequence will, on average, be 20% similar to any other sequence; not sure that this holds up for kmers; whatever. + vector risk(numProbs, 0); + + for(int i=1;icontrol_pressed) { return 0; } + G[i] = tree[taxaIndices[i]]->getSimToConsensus(sequence); + } + + double minRisk = 1e6; + int minRiskIndex = 0; + + for(int i=0;icontrol_pressed) { return 0; } + for(int j=0;jerrorOut(e, "KmerTree", "getMinRiskIndexKmer"); + exit(1); + } +} + +/**************************************************************************************************/ + +int KmerTree::sanityCheck(vector >& indices, vector& maxIndices){ + try { + int finalLevel = (int)indices.size()-1; + + for(int position=1;positioncontrol_pressed) { return 0; } + int predictedParent = tree[indices[position][maxIndices[position]]]->getParent(); + int actualParent = indices[position-1][maxIndices[position-1]]; + + if(predictedParent != actualParent){ + finalLevel = position - 1; + return finalLevel; + } + } + return finalLevel; + } + catch(exception& e) { + m->errorOut(e, "KmerTree", "sanityCheck"); + exit(1); + } +} + +/**************************************************************************************************/ +string KmerTree::getTaxonomy(Sequence* thisSeq){ + try { + string seqName = thisSeq->getName(); string querySequence = thisSeq->getAligned(); string taxonProbabilityString = ""; + string unalignedSeq = thisSeq->getUnaligned(); + + double logPOutlier = (querySequence.length() - kmerSize + 1) * log(1.0/(double)tree[0]->getNumUniqueKmers()); + + vector queryProfile = ripKmerProfile(unalignedSeq); //convert to kmer vector + + vector > pXgivenKj_D_j(numLevels); + vector > indices(numLevels); + for(int i=0;icontrol_pressed) { return taxonProbabilityString; } + pXgivenKj_D_j[i].push_back(logPOutlier); + indices[i].push_back(-1); + } + + for(int i=0;icontrol_pressed) { return taxonProbabilityString; } + pXgivenKj_D_j[tree[i]->getLevel()].push_back(tree[i]->getPxGivenkj_D_j(queryProfile)); + indices[tree[i]->getLevel()].push_back(i); + } + + vector sumLikelihood(numLevels, 0); + vector bestPosterior(numLevels, 0); + vector maxIndex(numLevels, 0); + int maxPosteriorIndex; + + //let's find the best level and taxa within that level + for(int i=0;icontrol_pressed) { return taxonProbabilityString; } + + int numTaxaInLevel = (int)indices[i].size(); + + vector posteriors(numTaxaInLevel, 0); + sumLikelihood[i] = getLogExpSum(pXgivenKj_D_j[i], maxPosteriorIndex); + + maxPosteriorIndex = 0; + for(int j=0;j posteriors[maxPosteriorIndex]){ + maxPosteriorIndex = j; + } + + } + + maxIndex[i] = getMinRiskIndexKmer(queryProfile, indices[i], posteriors); + + maxIndex[i] = maxPosteriorIndex; + bestPosterior[i] = posteriors[maxIndex[i]]; + } + + // vector pX_level(numLevels, 0); + // + // for(int i=0;igetNumSeqs(); + // } + // + // int max_pLevel_X_index = -1; + // double pX_level_sum = getLogExpSum(pX_level, max_pLevel_X_index); + // double max_pLevel_X = exp(pX_level[max_pLevel_X_index] - pX_level_sum); + // + // vector pLevel_X(numLevels, 0); + // for(int i=0;icontrol_pressed) { return taxonProbabilityString; } + int confidenceScore = (int) (bestPosterior[i] * 100); + if (confidenceScore >= confidenceThreshold) { + if(indices[i][maxIndex[i]] != -1){ + taxonProbabilityString += tree[indices[i][maxIndex[i]]]->getName() + "(" + toString(confidenceScore) + ");"; + simpleTax += tree[indices[i][maxIndex[i]]]->getName() + ";"; + + // levelProbabilityOutput << tree[indices[i][maxIndex[i]]]->getName() << '(' << setprecision(6) << pLevel_X[i] << ");"; + } + else{ + taxonProbabilityString += "unclassified" + '(' + toString(confidenceScore) + ");"; + // levelProbabilityOutput << "unclassified" << '(' << setprecision(6) << pLevel_X[i] << ");"; + simpleTax += "unclassified;"; + } + }else { break; } + savedspot = i; + } + + + + for(int i=savedspot+1;icontrol_pressed) { return taxonProbabilityString; } + taxonProbabilityString += "unclassified(0);"; + simpleTax += "unclassified;"; + } + + return taxonProbabilityString; + } + catch(exception& e) { + m->errorOut(e, "KmerTree", "getTaxonomy"); + exit(1); + } +} + + +/**************************************************************************************************/ + diff --git a/kmertree.h b/kmertree.h new file mode 100755 index 0000000..f7c10ef --- /dev/null +++ b/kmertree.h @@ -0,0 +1,37 @@ +// +// kmerTree.h +// pdsBayesian +// +// Created by Patrick Schloss on 4/3/12. +// Copyright (c) 2012 University of Michigan. All rights reserved. +// + +#ifndef pdsBayesian_kmerTree_h +#define pdsBayesian_kmerTree_h + +#include "classify.h" + +class KmerNode; + +class KmerTree : public Classify { + +public: + KmerTree(string, string, int, int); + ~KmerTree(); + + string getTaxonomy(Sequence*); + +private: + int addTaxonomyToTree(string, string, vector&); + vector ripKmerProfile(string); + int getMinRiskIndexKmer(vector&, vector&, vector&); + int aggregateThetas(); + int sanityCheck(vector >&, vector&); + + int kmerSize; + int numPossibleKmers, confidenceThreshold; + vector tree; + +}; + +#endif diff --git a/taxonomynode.cpp b/taxonomynode.cpp new file mode 100755 index 0000000..b90bda1 --- /dev/null +++ b/taxonomynode.cpp @@ -0,0 +1,72 @@ +/* + * taxonomynode.cpp + * + * + * Created by Pat Schloss on 7/8/11. + * Copyright 2011 Patrick D. Schloss. All rights reserved. + * + */ + +/**************************************************************************************************/ + +#include "taxonomynode.h" + +/**************************************************************************************************/ + +TaxonomyNode::TaxonomyNode(string n, int l): name(n), level(l){ + m = MothurOut::getInstance(); + parent = -1; + numChildren = 0; + numSeqs = 0; +} + +/**************************************************************************************************/ + +void TaxonomyNode::setName(string n) { name = n; } + +/**************************************************************************************************/ + +string TaxonomyNode::getName() { return name; } + +/**************************************************************************************************/ + +void TaxonomyNode::setParent(int p) { parent = p; } + +/**************************************************************************************************/ + +int TaxonomyNode::getParent() { return parent; } + +/**************************************************************************************************/ + +void TaxonomyNode::makeChild(string c, int i) { children[c] = i; } + + +/**************************************************************************************************/ + +map TaxonomyNode::getChildren() { return children; } + +/**************************************************************************************************/ + +int TaxonomyNode::getChildIndex(string c){ + map::iterator it = children.find(c); + if(it != children.end()) { return it->second; } + else { return -1; } +} + +/**************************************************************************************************/ + +int TaxonomyNode::getNumKids() { return (int)children.size(); } + +/**************************************************************************************************/ + +int TaxonomyNode::getNumSeqs() { return numSeqs; } + +/**************************************************************************************************/ + +void TaxonomyNode::setTotalSeqs(int n) { totalSeqs = n; } + +/**************************************************************************************************/ + +int TaxonomyNode::getLevel() { return level; } + +/**************************************************************************************************/ diff --git a/taxonomynode.h b/taxonomynode.h new file mode 100755 index 0000000..08bad3e --- /dev/null +++ b/taxonomynode.h @@ -0,0 +1,53 @@ +#ifndef TAXONOMYNODE +#define TAXONOMYNODE + +/* + * taxonomynode.h + * + * + * Created by Pat Schloss on 7/8/11. + * Copyright 2011 Patrick D. Schloss. All rights reserved. + * + */ + +/**************************************************************************************************/ + +#include "mothurout.h" +/**************************************************************************************************/ + +class TaxonomyNode { + +public: + TaxonomyNode(); + TaxonomyNode(string, int); + void setName(string); + string getName(); + + + void setParent(int); + int getParent(); + + void makeChild(string, int); + map getChildren(); + int getChildIndex(string); + int getNumKids(); + int getNumSeqs(); + void setTotalSeqs(int); + int getLevel(); + +private: + int parent; + map children; + int numChildren; + int level; + +protected: + MothurOut* m; + int numSeqs; + int totalSeqs; + string name; +}; + +/**************************************************************************************************/ + +#endif