From e8e13c129ba8184ec5932a090773f353f3ae3406 Mon Sep 17 00:00:00 2001 From: Sarah Westcott Date: Wed, 3 Oct 2012 14:51:10 -0400 Subject: [PATCH] added zap method to classify.seqs and changed bayesian method name to wang. --- Mothur.xcodeproj/project.pbxproj | 32 ++- alignnode.cpp | 257 ++++++++++++++++++++ alignnode.h | 49 ++++ aligntree.cpp | 371 +++++++++++++++++++++++++++++ aligntree.h | 34 +++ classify.cpp | 34 +++ classify.h | 6 +- classifyseqscommand.cpp | 52 +++-- classifyseqscommand.h | 9 +- kmernode.cpp | 209 +++++++++++++++++ kmernode.h | 45 ++++ kmertree.cpp | 386 +++++++++++++++++++++++++++++++ kmertree.h | 37 +++ taxonomynode.cpp | 72 ++++++ taxonomynode.h | 53 +++++ 15 files changed, 1621 insertions(+), 25 deletions(-) create mode 100755 alignnode.cpp create mode 100755 alignnode.h create mode 100755 aligntree.cpp create mode 100755 aligntree.h create mode 100755 kmernode.cpp create mode 100755 kmernode.h create mode 100755 kmertree.cpp create mode 100755 kmertree.h create mode 100755 taxonomynode.cpp create mode 100755 taxonomynode.h diff --git a/Mothur.xcodeproj/project.pbxproj b/Mothur.xcodeproj/project.pbxproj index 9d6261b..b2895e8 100644 --- a/Mothur.xcodeproj/project.pbxproj +++ b/Mothur.xcodeproj/project.pbxproj @@ -19,6 +19,11 @@ A71CB160130B04A2001E7287 /* anosimcommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A71CB15E130B04A2001E7287 /* anosimcommand.cpp */; }; A71FE12C12EDF72400963CA7 /* mergegroupscommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A71FE12B12EDF72400963CA7 /* mergegroupscommand.cpp */; }; A721765713BB9F7D0014DAAE /* referencedb.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A721765613BB9F7D0014DAAE /* referencedb.cpp */; }; + A721AB6A161C570F009860A1 /* alignnode.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A721AB66161C570F009860A1 /* alignnode.cpp */; }; + A721AB6B161C570F009860A1 /* aligntree.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A721AB68161C570F009860A1 /* aligntree.cpp */; }; + A721AB71161C572A009860A1 /* kmernode.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A721AB6D161C572A009860A1 /* kmernode.cpp */; }; + A721AB72161C572A009860A1 /* kmertree.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A721AB6F161C572A009860A1 /* kmertree.cpp */; }; + A721AB77161C573B009860A1 /* taxonomynode.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A721AB73161C573B009860A1 /* taxonomynode.cpp */; }; A724D2B7153C8628000A826F /* makebiomcommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A724D2B6153C8628000A826F /* makebiomcommand.cpp */; }; A727864412E9E28C00F86ABA /* removerarecommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A727864312E9E28C00F86ABA /* removerarecommand.cpp */; }; A7386C231619CCE600651424 /* classifysharedcommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7386C211619CCE600651424 /* classifysharedcommand.cpp */; }; @@ -395,6 +400,16 @@ A71FE12B12EDF72400963CA7 /* mergegroupscommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = mergegroupscommand.cpp; sourceTree = ""; }; A721765513BB9F7D0014DAAE /* referencedb.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = referencedb.h; sourceTree = ""; }; A721765613BB9F7D0014DAAE /* referencedb.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = referencedb.cpp; sourceTree = ""; }; + A721AB66161C570F009860A1 /* alignnode.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = alignnode.cpp; sourceTree = ""; }; + A721AB67161C570F009860A1 /* alignnode.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = alignnode.h; sourceTree = ""; }; + A721AB68161C570F009860A1 /* aligntree.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = aligntree.cpp; sourceTree = ""; }; + A721AB69161C570F009860A1 /* aligntree.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = aligntree.h; sourceTree = ""; }; + A721AB6D161C572A009860A1 /* kmernode.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = kmernode.cpp; sourceTree = ""; }; + A721AB6E161C572A009860A1 /* kmernode.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = kmernode.h; sourceTree = ""; }; + A721AB6F161C572A009860A1 /* kmertree.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = kmertree.cpp; sourceTree = ""; }; + A721AB70161C572A009860A1 /* kmertree.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = kmertree.h; sourceTree = ""; }; + A721AB73161C573B009860A1 /* taxonomynode.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = taxonomynode.cpp; sourceTree = ""; }; + A721AB74161C573B009860A1 /* taxonomynode.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = taxonomynode.h; sourceTree = ""; }; A724D2B4153C8600000A826F /* makebiomcommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = makebiomcommand.h; sourceTree = ""; }; A724D2B6153C8628000A826F /* makebiomcommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = makebiomcommand.cpp; sourceTree = ""; }; A727864212E9E28C00F86ABA /* removerarecommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = removerarecommand.h; sourceTree = ""; }; @@ -1775,10 +1790,18 @@ A7E9BA4B12D3966900DA6239 /* classifier */ = { isa = PBXGroup; children = ( - A7E9B65A12D37EC300DA6239 /* bayesian.cpp */, + A721AB67161C570F009860A1 /* alignnode.h */, + A721AB66161C570F009860A1 /* alignnode.cpp */, + A721AB69161C570F009860A1 /* aligntree.h */, + A721AB68161C570F009860A1 /* aligntree.cpp */, A7E9B65B12D37EC300DA6239 /* bayesian.h */, + A7E9B65A12D37EC300DA6239 /* bayesian.cpp */, A7E9B68E12D37EC400DA6239 /* classify.cpp */, A7E9B68F12D37EC400DA6239 /* classify.h */, + A721AB6E161C572A009860A1 /* kmernode.h */, + A721AB6D161C572A009860A1 /* kmernode.cpp */, + A721AB70161C572A009860A1 /* kmertree.h */, + A721AB6F161C572A009860A1 /* kmertree.cpp */, A7E9B73812D37EC400DA6239 /* knn.h */, A7E9B73712D37EC400DA6239 /* knn.cpp */, A7E9B78D12D37EC400DA6239 /* phylosummary.cpp */, @@ -1787,6 +1810,8 @@ A7E9B79012D37EC400DA6239 /* phylotree.h */, A7E9B85D12D37EC400DA6239 /* taxonomyequalizer.cpp */, A7E9B85E12D37EC400DA6239 /* taxonomyequalizer.h */, + A721AB74161C573B009860A1 /* taxonomynode.h */, + A721AB73161C573B009860A1 /* taxonomynode.cpp */, ); name = classifier; sourceTree = ""; @@ -2250,6 +2275,11 @@ A7386C29161A110800651424 /* decisiontree.cpp in Sources */, A77E1938161B201E00DB1A2A /* randomforest.cpp in Sources */, A77E193B161B289600DB1A2A /* rftreenode.cpp in Sources */, + A721AB6A161C570F009860A1 /* alignnode.cpp in Sources */, + A721AB6B161C570F009860A1 /* aligntree.cpp in Sources */, + A721AB71161C572A009860A1 /* kmernode.cpp in Sources */, + A721AB72161C572A009860A1 /* kmertree.cpp in Sources */, + A721AB77161C573B009860A1 /* taxonomynode.cpp in Sources */, ); runOnlyForDeploymentPostprocessing = 0; }; diff --git a/alignnode.cpp b/alignnode.cpp new file mode 100755 index 0000000..ccd8fb0 --- /dev/null +++ b/alignnode.cpp @@ -0,0 +1,257 @@ +/* + * alignNode.cpp + * bayesian + * + * Created by Pat Schloss on 10/11/11. + * Copyright 2011 Patrick D. Schloss. All rights reserved. + * + */ + +#include "alignNode.h" +#include "taxonomynode.h" + +#include "bayesian.h" + +/**************************************************************************************************/ + +AlignNode::AlignNode(string n, int l): TaxonomyNode(n, l){ + + alignLength = 0; +} + +/**************************************************************************************************/ + +void AlignNode::printTheta(){ + try { + m->mothurOut("A:\t"); for(int i=0;imothurOut(toString(theta[i].A)+ '\t'); } m->mothurOutEndLine(); + m->mothurOut("T:\t"); for(int i=0;imothurOut(toString(theta[i].T)+ '\t'); } m->mothurOutEndLine(); + m->mothurOut("G:\t"); for(int i=0;imothurOut(toString(theta[i].G)+ '\t'); } m->mothurOutEndLine(); + m->mothurOut("C:\t"); for(int i=0;imothurOut(toString(theta[i].C)+ '\t'); } m->mothurOutEndLine(); + m->mothurOut("I:\t"); for(int i=0;imothurOut(toString(theta[i].gap)+ '\t'); } m->mothurOutEndLine(); + } + catch(exception& e) { + m->errorOut(e, "AlignNode", "printTheta"); + exit(1); + } +} + +/**************************************************************************************************/ + +int AlignNode::loadSequence(string& sequence){ + try { + alignLength = (int)sequence.length(); // this function runs through the alignment and increments the frequency + // of each base for a particular taxon. we are building the thetas + + if(theta.size() == 0){ + theta.resize(alignLength); + columnCounts.resize(alignLength, 0); + } + + for(int i=0;icontrol_pressed) { return 0; } + + char base = sequence[i]; + + if(base == 'A') { theta[i].A++; columnCounts[i]++; } // our thetas will be alignLength x 5 + else if(base == 'T'){ theta[i].T++; columnCounts[i]++; } // and we ignore any position that has + else if(base == 'G'){ theta[i].G++; columnCounts[i]++; } // an ambiguous base call + else if(base == 'C'){ theta[i].C++; columnCounts[i]++; } + else if(base == '-'){ theta[i].gap++; columnCounts[i]++; } + else if(base == 'U'){ theta[i].T++; columnCounts[i]++; } + } + + numSeqs++; + + return 0; + } + catch(exception& e) { + m->errorOut(e, "AlignNode", "loadSequence"); + exit(1); + } +} + +/**************************************************************************************************/ + +int AlignNode::checkTheta(){ + try { + for(int i=0;icontrol_pressed) { return 0; } + + if(theta[i].gap == columnCounts[i]){ + columnCounts[i] = 0; + } + // else{ + // int maxCount = theta[i].A; + // + // if(theta[i].T > maxCount) { maxCount = theta[i].T; } + // if(theta[i].G > maxCount) { maxCount = theta[i].T; } + // if(theta[i].C > maxCount) { maxCount = theta[i].T; } + // if(theta[i].gap > maxCount) { maxCount = theta[i].T; } + // + // if(maxCount < columnCounts[i] * 0.25){// || maxCount == columnCounts[i]){ //remove any column where the maximum frequency is <50% + // columnCounts[i] = 0; + // } + // } + + } + + return 0; + } + catch(exception& e) { + m->errorOut(e, "AlignNode", "checkTheta"); + exit(1); + } +} + +/**************************************************************************************************/ + +int AlignNode::addThetas(vector newTheta, int newNumSeqs){ + try { + if(alignLength == 0){ + alignLength = (int)newTheta.size(); + theta.resize(alignLength); + columnCounts.resize(alignLength); + } + + for(int i=0;icontrol_pressed) { return 0; } + + theta[i].A += newTheta[i].A; columnCounts[i] += newTheta[i].A; + theta[i].T += newTheta[i].T; columnCounts[i] += newTheta[i].T; + theta[i].G += newTheta[i].G; columnCounts[i] += newTheta[i].G; + theta[i].C += newTheta[i].C; columnCounts[i] += newTheta[i].C; + theta[i].gap += newTheta[i].gap; columnCounts[i] += newTheta[i].gap; + } + + numSeqs += newNumSeqs; + + return 0; + } + catch(exception& e) { + m->errorOut(e, "AlignNode", "addThetas"); + exit(1); + } +} + +/**************************************************************************************************/ + +double AlignNode::getSimToConsensus(string& query){ + try { + double similarity = 0; + + int length = 0; + + for(int i=0;icontrol_pressed) { return similarity; } + + char base = query[i]; + + if(base != '.' && base != 'N' && columnCounts[i] != 0){ + + double fraction = 0; + + if(base == 'A'){ + fraction = (int) theta[i].A / (double) columnCounts[i]; + similarity += fraction; + length++; + } + else if(base == 'T'){ + fraction = (int) theta[i].T / (double) columnCounts[i]; + similarity += fraction; + length++; + } + else if(base == 'G'){ + fraction = (int) theta[i].G / (double) columnCounts[i]; + similarity += fraction; + length++; + } + else if(base == 'C'){ + fraction = (int) theta[i].C / (double) columnCounts[i]; + similarity += fraction; + length++; + } + else if(base == '-'){ + fraction = (int) theta[i].gap / (double) columnCounts[i]; + similarity += fraction; + length++; + } + } + } + + if(length != 0){ + similarity /= double(length); + } + else { + similarity = 0; + } + + return similarity; + + } + catch(exception& e) { + m->errorOut(e, "AlignNode", "getSimToConsensus"); + exit(1); + } +} + +/**************************************************************************************************/ + +double AlignNode::getPxGivenkj_D_j(string& query){ //P(x | k_j, D, j) + try { + double PxGivenkj_D_j = 0; + + int count = 0; + double alpha = 1 / (double)totalSeqs; //flat prior + + + for(int s=0;scontrol_pressed) { return PxGivenkj_D_j; } + + char base = query[s]; + thetaAlign thetaS = theta[s]; + + if(base != '.' && base != 'N' && columnCounts[s] != 0){ + double Nkj_s = (double)columnCounts[s]; + double nkj_si = 0; + + + if(base == 'A') { nkj_si = (double)thetaS.A; } + else if(base == 'T'){ nkj_si = (double)thetaS.T; } + else if(base == 'G'){ nkj_si = (double)thetaS.G; } + else if(base == 'C'){ nkj_si = (double)thetaS.C; } + else if(base == '-'){ nkj_si = (double)thetaS.gap; } + else if(base == 'U'){ nkj_si = (double)thetaS.T; } + + // double alpha = pow(0.2, double(Nkj_s)) + 0.0001; //need to make 1e-4 a variable in future; this is the non-flat prior + + // if(columnCounts[s] != nkj_si){ //deal only with segregating sites... + double numerator = nkj_si + alpha; + double denomenator = Nkj_s + 5.0 * alpha; + + PxGivenkj_D_j += log(numerator) - log(denomenator); + count++; + // } + } + if(base != '.' && columnCounts[s] == 0 && thetaS.gap == 0){ + count = 0; + break; + } + + } + + if(count == 0){ PxGivenkj_D_j = -1e10; } + + return PxGivenkj_D_j; + } + catch(exception& e) { + m->errorOut(e, "AlignNode", "getPxGivenkj_D_j"); + exit(1); + } +} + +/**************************************************************************************************/ diff --git a/alignnode.h b/alignnode.h new file mode 100755 index 0000000..4aecca7 --- /dev/null +++ b/alignnode.h @@ -0,0 +1,49 @@ +#ifndef ALIGNNODE +#define ALIGNNODE + +/* + * alignNode.h + * bayesian + * + * Created by Pat Schloss on 10/11/11. + * Copyright 2011 Patrick D. Schloss. All rights reserved. + * + */ + +#include "taxonomynode.h" + +/**************************************************************************************************/ + +struct thetaAlign { + thetaAlign() : A(0), T(0), G(0), C(0), gap(0){} + unsigned int A; + unsigned int T; + unsigned int G; + unsigned int C; + unsigned int gap; +}; + +/**************************************************************************************************/ + +class AlignNode : public TaxonomyNode { + +public: + AlignNode(string, int); + int loadSequence(string&); + int checkTheta(); + void printTheta(); + double getPxGivenkj_D_j(string& query); //P(x | k_j, D, j) + double getSimToConsensus(string& query); + vector getTheta() { return theta; } + int addThetas(vector, int); + +private: + vector theta; + vector columnCounts; + int alignLength; +}; + +/**************************************************************************************************/ + +#endif + diff --git a/aligntree.cpp b/aligntree.cpp new file mode 100755 index 0000000..41667ca --- /dev/null +++ b/aligntree.cpp @@ -0,0 +1,371 @@ +// +// alignTree.cpp +// pdsBayesian +// +// Created by Patrick Schloss on 4/3/12. +// Copyright (c) 2012 University of Michigan. All rights reserved. +// + +#include "alignnode.h" +#include "aligntree.h" + +/**************************************************************************************************/ + +AlignTree::AlignTree(string referenceFileName, string taxonomyFileName, int cutoff) : Classify(), confidenceThreshold(cutoff){ + try { + AlignNode* newNode = new AlignNode("Root", 0); + tree.push_back(newNode); // the tree is stored as a vector of elements of type TaxonomyNode + + string refTaxonomy; + + readTaxonomy(taxonomyFileName); + + ifstream referenceFile; + m->openInputFile(referenceFileName, referenceFile); + bool error = false; + map lengths; + while(!referenceFile.eof()){ + + if (m->control_pressed) { break; } + + Sequence seq(referenceFile); m->gobble(referenceFile); + + if (seq.getName() != "") { + map::iterator it = taxonomy.find(seq.getName()); + + if (it != taxonomy.end()) { + refTaxonomy = it->second; // lookup the taxonomy string for the current reference sequence + string aligned = seq.getAligned(); + lengths[aligned.length()] = 1; + if (lengths.size() > 1) { error = true; m->mothurOut("[ERROR]: reference sequences must be aligned to use the align method, quitting.\n"); break; } + addTaxonomyToTree(seq.getName(), refTaxonomy, aligned); + }else { + m->mothurOut(seq.getName() + " is in your reference file, but not in your taxonomy file, please correct.\n"); error = true; + } + } + } + referenceFile.close(); + + length = (lengths.begin())->first; + + if (error) { m->control_pressed = true; } + + numTaxa = (int)tree.size(); + + numLevels = 0; + for(int i=0;igetLevel(); + if(level > numLevels){ numLevels = level; } + } + numLevels++; + + aggregateThetas(); + + int dbSize = tree[0]->getNumSeqs(); + + for(int i=0;icheckTheta(); + tree[i]->setTotalSeqs(dbSize); + } + + } + catch(exception& e) { + m->errorOut(e, "AlignTree", "AlignTree"); + exit(1); + } +} + +/**************************************************************************************************/ + +AlignTree::~AlignTree(){ + try { + for(int i=0;ierrorOut(e, "AlignTree", "~AlignTree"); + exit(1); + } +} + +/**************************************************************************************************/ + +int AlignTree::addTaxonomyToTree(string seqName, string& taxonomy, string& sequence){ + try { + AlignNode* newNode; + string taxonName = ""; + int treePosition = 0; // the root is element 0 + + int level = 1; + + for(int i=0;icontrol_pressed) { break; } + + if(taxonomy[i] == ';'){ // looking for semicolons... + + if (taxonName == "") { m->mothurOut(seqName + " has an error in the taxonomy. This may be due to a ;;"); m->mothurOutEndLine(); m->control_pressed = true; } + + int newIndex = tree[treePosition]->getChildIndex(taxonName); // look to see if your current node already + // has a child with the new taxonName + if(newIndex != -1) { treePosition = newIndex; } // if you've seen it before, jump to that + else { // position in the tree + int newChildIndex = (int)tree.size(); // otherwise, we'll have to create one... + tree[treePosition]->makeChild(taxonName, newChildIndex); + + newNode = new AlignNode(taxonName, level); + + newNode->setParent(treePosition); + + tree.push_back(newNode); + treePosition = newChildIndex; + } + + // sequence data to that node to update that node's theta - seems slow... + taxonName = ""; // clear out the taxon name that we will build as we look + level++; + } // for a semicolon + else{ + taxonName += taxonomy[i]; // keep adding letters until we reach a semicolon + } + } + tree[treePosition]->loadSequence(sequence); // now that we've gotten to the correct node, add the + + return 0; + } + catch(exception& e) { + m->errorOut(e, "AlignTree", "addTaxonomyToTree"); + exit(1); + } +} + +/**************************************************************************************************/ + +int AlignTree::aggregateThetas(){ + try { + vector > levelMatrix(numLevels+1); + + for(int i=0;icontrol_pressed) { return 0; } + levelMatrix[tree[i]->getLevel()].push_back(i); + } + + for(int i=numLevels-1;i>0;i--){ + if (m->control_pressed) { return 0; } + for(int j=0;jgetParent()]->addThetas(holder->getTheta(), holder->getNumSeqs()); + } + } + return 0; + } + catch(exception& e) { + m->errorOut(e, "AlignTree", "aggregateThetas"); + exit(1); + } +} + +/**************************************************************************************************/ + +double AlignTree::getOutlierLogProbability(string& sequence){ + try { + double count = 0; + + for(int i=0;ierrorOut(e, "AlignTree", "getOutlierLogProbability"); + exit(1); + } +} + +/**************************************************************************************************/ + +int AlignTree::getMinRiskIndexAlign(string& sequence, vector& taxaIndices, vector& probabilities){ + try { + int numProbs = (int)probabilities.size(); + + vector G(numProbs, 0.2); //a random sequence will, on average, be 20% similar to any other sequence + vector risk(numProbs, 0); + + for(int i=1;icontrol_pressed) { return 0; } + G[i] = tree[taxaIndices[i]]->getSimToConsensus(sequence); + } + + double minRisk = 1e6; + int minRiskIndex = 0; + + for(int i=0;icontrol_pressed) { return 0; } + for(int j=0;jerrorOut(e, "AlignTree", "getMinRiskIndexAlign"); + exit(1); + } + +} + +/**************************************************************************************************/ + +int AlignTree::sanityCheck(vector >& indices, vector& maxIndices){ + try { + int finalLevel = (int)indices.size()-1; + + for(int position=1;positioncontrol_pressed) { return 0; } + int predictedParent = tree[indices[position][maxIndices[position]]]->getParent(); + int actualParent = indices[position-1][maxIndices[position-1]]; + + if(predictedParent != actualParent){ + finalLevel = position - 1; + return finalLevel; + } + } + return finalLevel; + } + catch(exception& e) { + m->errorOut(e, "AlignTree", "sanityCheck"); + exit(1); + } +} + +/**************************************************************************************************/ + +string AlignTree::getTaxonomy(Sequence* seq){ + try { + string seqName = seq->getName(); string querySequence = seq->getAligned(); string taxonProbabilityString = ""; + if (querySequence.length() != length) { + m->mothurOut("[ERROR]: " + seq->getName() + " has length " + toString(querySequence.length()) + ", reference sequences length is " + toString(length) + ". Are your sequences aligned? Sequences must be aligned to use the align search method.\n"); m->control_pressed = true; return ""; + } + double logPOutlier = getOutlierLogProbability(querySequence); + + vector > pXgivenKj_D_j(numLevels); + vector > indices(numLevels); + for(int i=0;icontrol_pressed) { return taxonProbabilityString; } + pXgivenKj_D_j[i].push_back(logPOutlier); + indices[i].push_back(-1); + } + + + for(int i=0;igetName() << '\t' << tree[i]->getLevel() << '\t' << tree[i]->getPxGivenkj_D_j(querySequence) << endl; + if (m->control_pressed) { return taxonProbabilityString; } + pXgivenKj_D_j[tree[i]->getLevel()].push_back(tree[i]->getPxGivenkj_D_j(querySequence)); + indices[tree[i]->getLevel()].push_back(i); + } + + vector sumLikelihood(numLevels, 0); + vector bestPosterior(numLevels, 0); + vector maxIndex(numLevels, 0); + int maxPosteriorIndex; + + + //cout << "before best level" << endl; + + //let's find the best level and taxa within that level + for(int i=0;icontrol_pressed) { return taxonProbabilityString; } + int numTaxaInLevel = (int)indices[i].size(); + + //cout << "numTaxaInLevel:\t" << numTaxaInLevel << endl; + + vector posteriors(numTaxaInLevel, 0); + sumLikelihood[i] = getLogExpSum(pXgivenKj_D_j[i], maxPosteriorIndex); + + maxPosteriorIndex = 0; + for(int j=0;j posteriors[maxPosteriorIndex]){ + maxPosteriorIndex = j; + } + + } + + maxIndex[i] = getMinRiskIndexAlign(querySequence, indices[i], posteriors); + + maxIndex[i] = maxPosteriorIndex; + bestPosterior[i] = posteriors[maxIndex[i]]; + } + + // vector pX_level(numLevels, 0); + // + // for(int i=0;igetNumSeqs(); + // } + // + // int max_pLevel_X_index = -1; + // double pX_level_sum = getLogExpSum(pX_level, max_pLevel_X_index); + // double max_pLevel_X = exp(pX_level[max_pLevel_X_index] - pX_level_sum); + // + // vector pLevel_X(numLevels, 0); + // for(int i=0;icontrol_pressed) { return taxonProbabilityString; } + int confidenceScore = (int) (bestPosterior[i] * 100); + if (confidenceScore >= confidenceThreshold) { + if(indices[i][maxIndex[i]] != -1){ + taxonProbabilityString += tree[indices[i][maxIndex[i]]]->getName() + '(' + toString(confidenceScore) + ");"; + simpleTax += tree[indices[i][maxIndex[i]]]->getName() + ";"; + // levelProbabilityOutput << tree[indices[i][maxIndex[i]]]->getName() << '(' << setprecision(6) << pLevel_X[i] << ");"; + } + else{ + taxonProbabilityString + "unclassified" + '(' + toString(confidenceScore) + ");"; + // levelProbabilityOutput << "unclassified" << '(' << setprecision(6) << pLevel_X[i] << ");"; + simpleTax += "unclassified;"; + } + }else { break; } + savedspot = i; + } + + for(int i=savedspot+1;icontrol_pressed) { return taxonProbabilityString; } + taxonProbabilityString + "unclassified(0);"; + simpleTax += "unclassified;"; + } + + return taxonProbabilityString; + } + catch(exception& e) { + m->errorOut(e, "AlignTree", "getTaxonomy"); + exit(1); + } +} + + +/**************************************************************************************************/ diff --git a/aligntree.h b/aligntree.h new file mode 100755 index 0000000..51008ff --- /dev/null +++ b/aligntree.h @@ -0,0 +1,34 @@ +// +// alignTree.h +// pdsBayesian +// +// Created by Patrick Schloss on 4/3/12. +// Copyright (c) 2012 University of Michigan. All rights reserved. +// + +#ifndef pdsBayesian_alignTree_h +#define pdsBayesian_alignTree_h + +#include "classify.h" + +class AlignNode; + +class AlignTree : public Classify { + +public: + AlignTree(string, string, int); + ~AlignTree(); + string getTaxonomy(Sequence*); + +private: + int addTaxonomyToTree(string, string&, string&); + double getOutlierLogProbability(string&); + int getMinRiskIndexAlign(string&, vector&, vector&); + int aggregateThetas(); + int sanityCheck(vector >&, vector&); + + int numSeqs, confidenceThreshold, length; + vector tree; +}; + +#endif diff --git a/classify.cpp b/classify.cpp index 459c90f..8aa3cdb 100644 --- a/classify.cpp +++ b/classify.cpp @@ -354,3 +354,37 @@ vector Classify::parseTax(string tax) { } /**************************************************************************************************/ +double Classify::getLogExpSum(vector probabilities, int& maxIndex){ + try { + // http://jblevins.org/notes/log-sum-exp + + double maxProb = probabilities[0]; + maxIndex = 0; + + int numProbs = (int)probabilities.size(); + + for(int i=1;i= maxProb){ + maxProb = probabilities[i]; + maxIndex = i; + } + } + + double probSum = 0.0000; + + for(int i=0;ierrorOut(e, "Classify", "getLogExpSum"); + exit(1); + } +} + +/**************************************************************************************************/ + diff --git a/classify.h b/classify.h index 6582be4..7b4c102 100644 --- a/classify.h +++ b/classify.h @@ -17,10 +17,8 @@ #include "database.hpp" #include "phylotree.h" - class Sequence; - /**************************************************************************************************/ class Classify { @@ -37,7 +35,6 @@ public: protected: map taxonomy; //name maps to taxonomy - //map genusCount; //maps genus to count - in essence a list of how many seqs are in each taxonomy map::iterator itTax; map::iterator it; Database* database; @@ -45,11 +42,12 @@ protected: string taxFile, templateFile, simpleTax; vector names; - int threadID; + int threadID, numLevels, numTaxa; bool flip, flipped, shortcuts; int readTaxonomy(string); vector parseTax(string); + double getLogExpSum(vector, int&); MothurOut* m; }; diff --git a/classifyseqscommand.cpp b/classifyseqscommand.cpp index 2e55673..36bccc2 100644 --- a/classifyseqscommand.cpp +++ b/classifyseqscommand.cpp @@ -21,9 +21,9 @@ vector ClassifySeqsCommand::setParameters(){ CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "none", "none",false,false); parameters.push_back(pcount); CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "none", "none",false,false); parameters.push_back(pgroup); - CommandParameter psearch("search", "Multiple", "kmer-blast-suffix-distance", "kmer", "", "", "",false,false); parameters.push_back(psearch); + CommandParameter psearch("search", "Multiple", "kmer-blast-suffix-distance-align", "kmer", "", "", "",false,false); parameters.push_back(psearch); CommandParameter pksize("ksize", "Number", "", "8", "", "", "",false,false); parameters.push_back(pksize); - CommandParameter pmethod("method", "Multiple", "bayesian-knn", "bayesian", "", "", "",false,false); parameters.push_back(pmethod); + CommandParameter pmethod("method", "Multiple", "wang-knn-zap", "wang", "", "", "",false,false); parameters.push_back(pmethod); CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors); CommandParameter pmatch("match", "Number", "", "1.0", "", "", "",false,false); parameters.push_back(pmatch); CommandParameter pmismatch("mismatch", "Number", "", "-1.0", "", "", "",false,false); parameters.push_back(pmismatch); @@ -55,11 +55,11 @@ string ClassifySeqsCommand::getHelpString(){ helpString += "The classify.seqs command reads a fasta file containing sequences and creates a .taxonomy file and a .tax.summary file.\n"; helpString += "The classify.seqs command parameters are reference, fasta, name, group, count, search, ksize, method, taxonomy, processors, match, mismatch, gapopen, gapextend, numwanted and probs.\n"; helpString += "The reference, fasta and taxonomy parameters are required. You may enter multiple fasta files by separating their names with dashes. ie. fasta=abrecovery.fasta-amzon.fasta \n"; - helpString += "The search parameter allows you to specify the method to find most similar template. Your options are: suffix, kmer, blast and distance. The default is kmer.\n"; + helpString += "The search parameter allows you to specify the method to find most similar template. Your options are: suffix, kmer, blast, align and distance. The default is kmer.\n"; helpString += "The name parameter allows you add a names file with your fasta file, if you enter multiple fasta files, you must enter matching names files for them.\n"; helpString += "The group parameter allows you add a group file so you can have the summary totals broken up by group.\n"; helpString += "The count parameter allows you add a count file so you can have the summary totals broken up by group.\n"; - helpString += "The method parameter allows you to specify classification method to use. Your options are: bayesian and knn. The default is bayesian.\n"; + helpString += "The method parameter allows you to specify classification method to use. Your options are: wang, knn and zap. The default is wang.\n"; helpString += "The ksize parameter allows you to specify the kmer size for finding most similar template to candidate. The default is 8.\n"; helpString += "The processors parameter allows you to specify the number of processors to use. The default is 1.\n"; #ifdef USE_MPI @@ -72,8 +72,8 @@ string ClassifySeqsCommand::getHelpString(){ helpString += "The gapextend parameter allows you to specify the penalty for extending a gap in an alignment. The default is -1.0.\n"; helpString += "The numwanted parameter allows you to specify the number of sequence matches you want with the knn method. The default is 10.\n"; helpString += "The cutoff parameter allows you to specify a bootstrap confidence threshold for your taxonomy. The default is 0.\n"; - helpString += "The probs parameter shuts off the bootstrapping results for the bayesian method. The default is true, meaning you want the bootstrapping to be shown.\n"; - helpString += "The iters parameter allows you to specify how many iterations to do when calculating the bootstrap confidence score for your taxonomy with the bayesian method. The default is 100.\n"; + helpString += "The probs parameter shuts off the bootstrapping results for the wang and zap method. The default is true, meaning you want the bootstrapping to be shown.\n"; + helpString += "The iters parameter allows you to specify how many iterations to do when calculating the bootstrap confidence score for your taxonomy with the wang method. The default is 100.\n"; //helpString += "The flip parameter allows you shut off mothur's The default is T.\n"; helpString += "The classify.seqs command should be in the following format: \n"; helpString += "classify.seqs(reference=yourTemplateFile, fasta=yourFastaFile, method=yourClassificationMethod, search=yourSearchmethod, ksize=yourKmerSize, taxonomy=yourTaxonomyFile, processors=yourProcessors) \n"; @@ -492,9 +492,6 @@ ClassifySeqsCommand::ClassifySeqsCommand(string option) { //check for optional parameter and set defaults // ...at some point should added some additional type checking... string temp; - temp = validParameter.validFile(parameters, "ksize", false); if (temp == "not found"){ temp = "8"; } - m->mothurConvert(temp, kmerSize); - temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); } m->setProcessors(temp); m->mothurConvert(temp, processors); @@ -536,7 +533,13 @@ ClassifySeqsCommand::ClassifySeqsCommand(string option) { search = validParameter.validFile(parameters, "search", false); if (search == "not found"){ search = "kmer"; } - method = validParameter.validFile(parameters, "method", false); if (method == "not found"){ method = "bayesian"; } + method = validParameter.validFile(parameters, "method", false); if (method == "not found"){ method = "wang"; } + + temp = validParameter.validFile(parameters, "ksize", false); if (temp == "not found"){ + temp = "8"; + if (method == "zap") { temp = "7"; } + } + m->mothurConvert(temp, kmerSize); temp = validParameter.validFile(parameters, "match", false); if (temp == "not found"){ temp = "1.0"; } m->mothurConvert(temp, match); @@ -570,8 +573,13 @@ ClassifySeqsCommand::ClassifySeqsCommand(string option) { m->mothurConvert(temp, iters); - if ((method == "bayesian") && (search != "kmer")) { - m->mothurOut("The bayesian method requires the kmer search." + search + "will be disregarded." ); m->mothurOutEndLine(); + if ((method == "wang") && (search != "kmer")) { + m->mothurOut("The wang method requires the kmer search. " + search + " will be disregarded, and kmer will be used." ); m->mothurOutEndLine(); + search = "kmer"; + } + + if ((method == "zap") && ((search != "kmer") && (search != "align"))) { + m->mothurOut("The zap method requires the kmer or align search. " + search + " will be disregarded, and kmer will be used." ); m->mothurOutEndLine(); search = "kmer"; } @@ -605,10 +613,16 @@ int ClassifySeqsCommand::execute(){ try { if (abort == true) { if (calledHelp) { return 0; } return 2; } - if(method == "bayesian"){ classify = new Bayesian(taxonomyFileName, templateFileName, search, kmerSize, cutoff, iters, rand(), flip, writeShortcuts); } + string outputMethodTag = method + "."; + if(method == "wang"){ classify = new Bayesian(taxonomyFileName, templateFileName, search, kmerSize, cutoff, iters, rand(), flip, writeShortcuts); } else if(method == "knn"){ classify = new Knn(taxonomyFileName, templateFileName, search, kmerSize, gapOpen, gapExtend, match, misMatch, numWanted, rand()); } + else if(method == "zap"){ + outputMethodTag = search + "_" + outputMethodTag; + if (search == "kmer") { classify = new KmerTree(templateFileName, taxonomyFileName, kmerSize, cutoff); } + else { classify = new AlignTree(templateFileName, taxonomyFileName, cutoff); } + } else { - m->mothurOut(search + " is not a valid method option. I will run the command using bayesian."); + m->mothurOut(search + " is not a valid method option. I will run the command using wang."); m->mothurOutEndLine(); classify = new Bayesian(taxonomyFileName, templateFileName, search, kmerSize, cutoff, iters, rand(), flip, writeShortcuts); } @@ -633,10 +647,10 @@ int ClassifySeqsCommand::execute(){ if (RippedTaxName != "") { RippedTaxName += "."; } if (outputDir == "") { outputDir += m->hasPath(fastaFileNames[s]); } - string newTaxonomyFile = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])) + RippedTaxName + getOutputFileNameTag("taxonomy"); - string newaccnosFile = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])) + RippedTaxName + getOutputFileNameTag("accnos"); + string newTaxonomyFile = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])) + RippedTaxName + outputMethodTag + getOutputFileNameTag("taxonomy"); + string newaccnosFile = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])) + RippedTaxName + outputMethodTag +getOutputFileNameTag("accnos"); string tempTaxonomyFile = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])) + "taxonomy.temp"; - string taxSummary = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])) + RippedTaxName + getOutputFileNameTag("taxsummary"); + string taxSummary = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])) + RippedTaxName + outputMethodTag + getOutputFileNameTag("taxsummary"); if ((method == "knn") && (search == "distance")) { string DistName = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])) + getOutputFileNameTag("matchdist"); @@ -798,12 +812,12 @@ int ClassifySeqsCommand::execute(){ if (hasCount) { ct = new CountTable(); ct->readTable(countfileNames[s]); - taxaSum = new PhyloSummary(baseTName, ct); + taxaSum = new PhyloSummary(taxonomyFileName, ct); taxaSum->summarize(tempTaxonomyFile); }else { if (groupfile != "") { group = groupfileNames[s]; groupMap = new GroupMap(group); groupMap->readMap(); } - taxaSum = new PhyloSummary(baseTName, groupMap); + taxaSum = new PhyloSummary(taxonomyFileName, groupMap); if (m->control_pressed) { outputTypes.clear(); if (ct != NULL) { delete ct; } if (groupMap != NULL) { delete groupMap; } delete taxaSum; for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } delete classify; return 0; } diff --git a/classifyseqscommand.h b/classifyseqscommand.h index 6d11d92..0cffec6 100644 --- a/classifyseqscommand.h +++ b/classifyseqscommand.h @@ -19,9 +19,11 @@ #include "phylotree.h" #include "phylosummary.h" #include "knn.h" +#include "kmertree.h" +#include "aligntree.h" -//KNN and Bayesian methods modeled from algorithms in +//KNN and Wang methods modeled from algorithms in //Naı¨ve Bayesian Classifier for Rapid Assignment of rRNA Sequences //into the New Bacterial Taxonomy􏰎† //Qiong Wang,1 George M. Garrity,1,2 James M. Tiedje,1,2 and James R. Cole1* @@ -166,6 +168,11 @@ static DWORD WINAPI MyClassThreadFunction(LPVOID lpParam){ Classify* myclassify; if(pDataArray->method == "bayesian"){ myclassify = new Bayesian(pDataArray->taxonomyFileName, pDataArray->templateFileName, pDataArray->search, pDataArray->kmerSize, pDataArray->cutoff, pDataArray->iters, pDataArray->threadID, pDataArray->flip, pDataArray->writeShortcuts); } else if(pDataArray->method == "knn"){ myclassify = new Knn(pDataArray->taxonomyFileName, pDataArray->templateFileName, pDataArray->search, pDataArray->kmerSize, pDataArray->gapOpen, pDataArray->gapExtend, pDataArray->match, pDataArray->misMatch, pDataArray->numWanted, pDataArray->threadID); } + else if(pDataArray->method == "zap"){ + outputMethodTag = search + "_" + outputMethodTag; + if (pDataArray->search == "kmer") { myclassify = new KmerTree(pDataArray->templateFileName, pDataArray->taxonomyFileName, pDataArray->kmerSize, pDataArray->cutoff); } + else { myclassify = new AlignTree(pDataArray->templateFileName, pDataArray->taxonomyFileName, pDataArray->cutoff); } + } else { pDataArray->m->mothurOut(pDataArray->search + " is not a valid method option. I will run the command using bayesian."); pDataArray->m->mothurOutEndLine(); diff --git a/kmernode.cpp b/kmernode.cpp new file mode 100755 index 0000000..c087cac --- /dev/null +++ b/kmernode.cpp @@ -0,0 +1,209 @@ +/* + * kmerNode.cpp + * bayesian + * + * Created by Pat Schloss on 10/11/11. + * Copyright 2011 Patrick D. Schloss. All rights reserved. + * + */ + +#include "kmernode.h" + + +/**********************************************************************************************************************/ + +KmerNode::KmerNode(string s, int l, int n) : TaxonomyNode(s, l), kmerSize(n) { + try { + int power4s[14] = { 1, 4, 16, 64, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304, 16777216, 67108864 }; + + numPossibleKmers = power4s[kmerSize]; + numUniqueKmers = 0; + + kmerVector.assign(numPossibleKmers, 0); + } + catch(exception& e) { + m->errorOut(e, "KmerNode", "KmerNode"); + exit(1); + } +} + +/**********************************************************************************************************************/ + +void KmerNode::loadSequence(vector& kmerProfile){ + try { + for(int i=0;icontrol_pressed) { break; } + if(kmerVector[i] == 0 && kmerProfile[i] != 0) { numUniqueKmers++; } + + kmerVector[i] += kmerProfile[i]; + } + + numSeqs++; + } + catch(exception& e) { + m->errorOut(e, "KmerNode", "loadSequence"); + exit(1); + } +} + +/**********************************************************************************************************************/ + +string KmerNode::getKmerBases(int kmerNumber){ + try { + // Here we convert the kmer number into the kmer in terms of bases. + // + // Example: Score = 915 (for a 6-mer) + // Base6 = (915 / 4^0) % 4 = 915 % 4 = 3 => T [T] + // Base5 = (915 / 4^1) % 4 = 228 % 4 = 0 => A [AT] + // Base4 = (915 / 4^2) % 4 = 57 % 4 = 1 => C [CAT] + // Base3 = (915 / 4^3) % 4 = 14 % 4 = 2 => G [GCAT] + // Base2 = (915 / 4^4) % 4 = 3 % 4 = 3 => T [TGCAT] + // Base1 = (915 / 4^5) % 4 = 0 % 4 = 0 => A [ATGCAT] -> this checks out with the previous method + + int power4s[14] = { 1, 4, 16, 64, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304, 16777216, 67108864 }; + + string kmer = ""; + + if(kmerNumber == power4s[kmerSize]){//pow(4.,7)){ // if the kmer number is the same as the maxKmer then it must + for(int i=0;icontrol_pressed) { return kmer; } + int nt = (int)(kmerNumber / (float)power4s[i]) % 4; // the '%' operator returns the remainder + if(nt == 0) { kmer = 'A' + kmer; } // from int-based division ] + else if(nt == 1){ kmer = 'C' + kmer; } + else if(nt == 2){ kmer = 'G' + kmer; } + else if(nt == 3){ kmer = 'T' + kmer; } + } + } + return kmer; + } + catch(exception& e) { + m->errorOut(e, "KmerNode", "getKmerBases"); + exit(1); + } +} + +/**************************************************************************************************/ + +void KmerNode::addThetas(vector newTheta, int newNumSeqs){ + try { + for(int i=0;icontrol_pressed) { break; } + kmerVector[i] += newTheta[i]; + } + + // if(alignLength == 0){ + // alignLength = (int)newTheta.size(); + // theta.resize(alignLength); + // columnCounts.resize(alignLength); + // } + // + // for(int i=0;ierrorOut(e, "KmerNode", "addThetas"); + exit(1); + } +} + +/**********************************************************************************************************************/ + +int KmerNode::getNumUniqueKmers(){ + try { + if(numUniqueKmers == 0){ + + for(int i=0;icontrol_pressed) { return numUniqueKmers; } + if(kmerVector[i] != 0){ + numUniqueKmers++; + } + + } + + } + + return numUniqueKmers; + + } + catch(exception& e) { + m->errorOut(e, "KmerNode", "getNumUniqueKmers"); + exit(1); + } +} + +/**********************************************************************************************************************/ + +void KmerNode::printTheta(){ + try { + m->mothurOut(name + "\n"); + for(int i=0;imothurOut(getKmerBases(i) + '\t' + toString(kmerVector[i]) + "\n"); + } + } + m->mothurOutEndLine(); + } + catch(exception& e) { + m->errorOut(e, "KmerNode", "printTheta"); + exit(1); + } + +} +/**************************************************************************************************/ + +double KmerNode::getSimToConsensus(vector& queryKmerProfile){ + try { + double present = 0; + + for(int i=0;icontrol_pressed) { return present; } + if(queryKmerProfile[i] != 0 && kmerVector[i] != 0){ + present++; + } + } + + return present / double(queryKmerProfile.size() - kmerSize + 1); + } + catch(exception& e) { + m->errorOut(e, "KmerNode", "getSimToConsensus"); + exit(1); + } +} + +/**********************************************************************************************************************/ + +double KmerNode::getPxGivenkj_D_j(vector& queryKmerProfile) { + try { + double sumLogProb = 0.0000; + double alpha = 1.0 / (double)totalSeqs; //flat prior + // double alpha = pow((1.0 / (double)numUniqueKmers), numSeqs)+0.0001; //non-flat prior + + for(int i=0;icontrol_pressed) { return sumLogProb; } + if(queryKmerProfile[i] != 0){ //numUniqueKmers needs to be the value from Root; + sumLogProb += log((kmerVector[i] + alpha) / (numSeqs + numUniqueKmers * alpha)); + } + + } + return sumLogProb; + } + catch(exception& e) { + m->errorOut(e, "KmerNode", "getPxGivenkj_D_j"); + exit(1); + } + +} + +/**********************************************************************************************************************/ diff --git a/kmernode.h b/kmernode.h new file mode 100755 index 0000000..e15fb1d --- /dev/null +++ b/kmernode.h @@ -0,0 +1,45 @@ +#ifndef KMERNODE +#define KMERNODE + +/* + * kmerNode.h + * bayesian + * + * Created by Pat Schloss on 10/11/11. + * Copyright 2011 Patrick D. Schloss. All rights reserved. + * + */ + + +#include "taxonomynode.h" + +/**********************************************************************************************************************/ + +class KmerNode : public TaxonomyNode { + +public: + KmerNode(string, int, int); + void loadSequence(vector&); + void printTheta(); + double getPxGivenkj_D_j(vector&); + double getSimToConsensus(vector&); + void checkTheta(){}; + void setNumUniqueKmers(int num) { numUniqueKmers = num; } + int getNumUniqueKmers(); + void addThetas(vector, int); + vector getTheta() { return kmerVector; } + + +private: + string getKmerBases(int); + int kmerSize; // value of k + int numPossibleKmers; // 4^kmerSize + int numUniqueKmers; // number of unique kmers seen in a group ~ O_kj + int numKmers; // number of kmers in a sequence + vector kmerVector; // counts of kmers across all sequences in a node +}; + +/**********************************************************************************************************************/ + +#endif + diff --git a/kmertree.cpp b/kmertree.cpp new file mode 100755 index 0000000..fbf2bfb --- /dev/null +++ b/kmertree.cpp @@ -0,0 +1,386 @@ +// +// kmerTree.cpp +// pdsBayesian +// +// Created by Patrick Schloss on 4/3/12. +// Copyright (c) 2012 University of Michigan. All rights reserved. +// + +#include "kmernode.h" +#include "kmertree.h" + +/**************************************************************************************************/ + +KmerTree::KmerTree(string referenceFileName, string taxonomyFileName, int k, int cutoff) : Classify(), confidenceThreshold(cutoff), kmerSize(k){ + try { + KmerNode* newNode = new KmerNode("Root", 0, kmerSize); + tree.push_back(newNode); // the tree is stored as a vector of elements of type TaxonomyNode + + int power4s[14] = { 1, 4, 16, 64, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304, 16777216, 67108864 }; + numPossibleKmers = power4s[kmerSize]; + + string refTaxonomy; + + readTaxonomy(taxonomyFileName); + + ifstream referenceFile; + m->openInputFile(referenceFileName, referenceFile); + bool error = false; + while(!referenceFile.eof()){ + + if (m->control_pressed) { break; } + + Sequence seq(referenceFile); m->gobble(referenceFile); + + if (seq.getName() != "") { + map::iterator it = taxonomy.find(seq.getName()); + + if (it != taxonomy.end()) { + refTaxonomy = it->second; // lookup the taxonomy string for the current reference sequence + vector kmerProfile = ripKmerProfile(seq.getUnaligned()); //convert to kmer vector + addTaxonomyToTree(seq.getName(), refTaxonomy, kmerProfile); + }else { + m->mothurOut(seq.getName() + " is in your reference file, but not in your taxonomy file, please correct.\n"); error = true; + } + } + } + referenceFile.close(); + + if (error) { m->control_pressed = true; } + + numTaxa = (int)tree.size(); + numLevels = 0; + for(int i=0;igetLevel(); + if(level > numLevels){ numLevels = level; } + } + numLevels++; + + aggregateThetas(); + + int dbSize = tree[0]->getNumSeqs(); + + for(int i=0;icheckTheta(); + tree[i]->setNumUniqueKmers(tree[0]->getNumUniqueKmers()); + tree[i]->setTotalSeqs(dbSize); + } + } + catch(exception& e) { + m->errorOut(e, "KmerTree", "KmerTree"); + exit(1); + } +} + +/**************************************************************************************************/ + +KmerTree::~KmerTree(){ + + for(int i=0;i KmerTree::ripKmerProfile(string sequence){ + try { + // assume all input sequences are unaligned + + int power4s[14] = { 1, 4, 16, 64, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304, 16777216, 67108864 }; + + int nKmers = (int)sequence.length() - kmerSize + 1; + + vector kmerProfile(numPossibleKmers + 1, 0); + + for(int i=0;icontrol_pressed) { break; } + + int kmer = 0; + for(int j=0;jerrorOut(e, "KmerTree", "ripKmerProfile"); + exit(1); + } +} + +/**************************************************************************************************/ + +int KmerTree::addTaxonomyToTree(string seqName, string taxonomy, vector& sequence){ + try { + KmerNode* newNode; + string taxonName = ""; + int treePosition = 0; // the root is element 0 + + + int level = 1; + + for(int i=0;icontrol_pressed) { break; } + if(taxonomy[i] == ';'){ // looking for semicolons... + + if (taxonName == "") { m->mothurOut(seqName + " has an error in the taxonomy. This may be due to a ;;"); m->mothurOutEndLine(); m->control_pressed = true; } + + int newIndex = tree[treePosition]->getChildIndex(taxonName);// look to see if your current node already + // has a child with the new taxonName + if(newIndex != -1) { treePosition = newIndex; } // if you've seen it before, jump to that + else { // position in the tree + int newChildIndex = (int)tree.size(); // otherwise, we'll have to create one... + tree[treePosition]->makeChild(taxonName, newChildIndex); + + newNode = new KmerNode(taxonName, level, kmerSize); + + newNode->setParent(treePosition); + + tree.push_back(newNode); + treePosition = newChildIndex; + } + + // sequence data to that node to update that node's theta - seems slow... + taxonName = ""; // clear out the taxon name that we will build as we look + level++; + + } // for a semicolon + else{ + taxonName += taxonomy[i]; // keep adding letters until we reach a semicolon + } + } + + tree[treePosition]->loadSequence(sequence); // now that we've gotten to the correct node, add the + + return 0; + } + catch(exception& e) { + m->errorOut(e, "KmerTree", "addTaxonomyToTree"); + exit(1); + } + +} + +/**************************************************************************************************/ + +int KmerTree::aggregateThetas(){ + try { + vector > levelMatrix(numLevels+1); + + for(int i=0;icontrol_pressed) { return 0; } + levelMatrix[tree[i]->getLevel()].push_back(i); + } + + for(int i=numLevels-1;i>0;i--) { + if (m->control_pressed) { return 0; } + + for(int j=0;jgetParent()]->addThetas(holder->getTheta(), holder->getNumSeqs()); + } + } + + return 0; + } + catch(exception& e) { + m->errorOut(e, "KmerTree", "aggregateThetas"); + exit(1); + } +} + +/**************************************************************************************************/ + +int KmerTree::getMinRiskIndexKmer(vector& sequence, vector& taxaIndices, vector& probabilities){ + try { + int numProbs = (int)probabilities.size(); + + vector G(numProbs, 0.2); //a random sequence will, on average, be 20% similar to any other sequence; not sure that this holds up for kmers; whatever. + vector risk(numProbs, 0); + + for(int i=1;icontrol_pressed) { return 0; } + G[i] = tree[taxaIndices[i]]->getSimToConsensus(sequence); + } + + double minRisk = 1e6; + int minRiskIndex = 0; + + for(int i=0;icontrol_pressed) { return 0; } + for(int j=0;jerrorOut(e, "KmerTree", "getMinRiskIndexKmer"); + exit(1); + } +} + +/**************************************************************************************************/ + +int KmerTree::sanityCheck(vector >& indices, vector& maxIndices){ + try { + int finalLevel = (int)indices.size()-1; + + for(int position=1;positioncontrol_pressed) { return 0; } + int predictedParent = tree[indices[position][maxIndices[position]]]->getParent(); + int actualParent = indices[position-1][maxIndices[position-1]]; + + if(predictedParent != actualParent){ + finalLevel = position - 1; + return finalLevel; + } + } + return finalLevel; + } + catch(exception& e) { + m->errorOut(e, "KmerTree", "sanityCheck"); + exit(1); + } +} + +/**************************************************************************************************/ +string KmerTree::getTaxonomy(Sequence* thisSeq){ + try { + string seqName = thisSeq->getName(); string querySequence = thisSeq->getAligned(); string taxonProbabilityString = ""; + string unalignedSeq = thisSeq->getUnaligned(); + + double logPOutlier = (querySequence.length() - kmerSize + 1) * log(1.0/(double)tree[0]->getNumUniqueKmers()); + + vector queryProfile = ripKmerProfile(unalignedSeq); //convert to kmer vector + + vector > pXgivenKj_D_j(numLevels); + vector > indices(numLevels); + for(int i=0;icontrol_pressed) { return taxonProbabilityString; } + pXgivenKj_D_j[i].push_back(logPOutlier); + indices[i].push_back(-1); + } + + for(int i=0;icontrol_pressed) { return taxonProbabilityString; } + pXgivenKj_D_j[tree[i]->getLevel()].push_back(tree[i]->getPxGivenkj_D_j(queryProfile)); + indices[tree[i]->getLevel()].push_back(i); + } + + vector sumLikelihood(numLevels, 0); + vector bestPosterior(numLevels, 0); + vector maxIndex(numLevels, 0); + int maxPosteriorIndex; + + //let's find the best level and taxa within that level + for(int i=0;icontrol_pressed) { return taxonProbabilityString; } + + int numTaxaInLevel = (int)indices[i].size(); + + vector posteriors(numTaxaInLevel, 0); + sumLikelihood[i] = getLogExpSum(pXgivenKj_D_j[i], maxPosteriorIndex); + + maxPosteriorIndex = 0; + for(int j=0;j posteriors[maxPosteriorIndex]){ + maxPosteriorIndex = j; + } + + } + + maxIndex[i] = getMinRiskIndexKmer(queryProfile, indices[i], posteriors); + + maxIndex[i] = maxPosteriorIndex; + bestPosterior[i] = posteriors[maxIndex[i]]; + } + + // vector pX_level(numLevels, 0); + // + // for(int i=0;igetNumSeqs(); + // } + // + // int max_pLevel_X_index = -1; + // double pX_level_sum = getLogExpSum(pX_level, max_pLevel_X_index); + // double max_pLevel_X = exp(pX_level[max_pLevel_X_index] - pX_level_sum); + // + // vector pLevel_X(numLevels, 0); + // for(int i=0;icontrol_pressed) { return taxonProbabilityString; } + int confidenceScore = (int) (bestPosterior[i] * 100); + if (confidenceScore >= confidenceThreshold) { + if(indices[i][maxIndex[i]] != -1){ + taxonProbabilityString += tree[indices[i][maxIndex[i]]]->getName() + "(" + toString(confidenceScore) + ");"; + simpleTax += tree[indices[i][maxIndex[i]]]->getName() + ";"; + + // levelProbabilityOutput << tree[indices[i][maxIndex[i]]]->getName() << '(' << setprecision(6) << pLevel_X[i] << ");"; + } + else{ + taxonProbabilityString += "unclassified" + '(' + toString(confidenceScore) + ");"; + // levelProbabilityOutput << "unclassified" << '(' << setprecision(6) << pLevel_X[i] << ");"; + simpleTax += "unclassified;"; + } + }else { break; } + savedspot = i; + } + + + + for(int i=savedspot+1;icontrol_pressed) { return taxonProbabilityString; } + taxonProbabilityString += "unclassified(0);"; + simpleTax += "unclassified;"; + } + + return taxonProbabilityString; + } + catch(exception& e) { + m->errorOut(e, "KmerTree", "getTaxonomy"); + exit(1); + } +} + + +/**************************************************************************************************/ + diff --git a/kmertree.h b/kmertree.h new file mode 100755 index 0000000..f7c10ef --- /dev/null +++ b/kmertree.h @@ -0,0 +1,37 @@ +// +// kmerTree.h +// pdsBayesian +// +// Created by Patrick Schloss on 4/3/12. +// Copyright (c) 2012 University of Michigan. All rights reserved. +// + +#ifndef pdsBayesian_kmerTree_h +#define pdsBayesian_kmerTree_h + +#include "classify.h" + +class KmerNode; + +class KmerTree : public Classify { + +public: + KmerTree(string, string, int, int); + ~KmerTree(); + + string getTaxonomy(Sequence*); + +private: + int addTaxonomyToTree(string, string, vector&); + vector ripKmerProfile(string); + int getMinRiskIndexKmer(vector&, vector&, vector&); + int aggregateThetas(); + int sanityCheck(vector >&, vector&); + + int kmerSize; + int numPossibleKmers, confidenceThreshold; + vector tree; + +}; + +#endif diff --git a/taxonomynode.cpp b/taxonomynode.cpp new file mode 100755 index 0000000..b90bda1 --- /dev/null +++ b/taxonomynode.cpp @@ -0,0 +1,72 @@ +/* + * taxonomynode.cpp + * + * + * Created by Pat Schloss on 7/8/11. + * Copyright 2011 Patrick D. Schloss. All rights reserved. + * + */ + +/**************************************************************************************************/ + +#include "taxonomynode.h" + +/**************************************************************************************************/ + +TaxonomyNode::TaxonomyNode(string n, int l): name(n), level(l){ + m = MothurOut::getInstance(); + parent = -1; + numChildren = 0; + numSeqs = 0; +} + +/**************************************************************************************************/ + +void TaxonomyNode::setName(string n) { name = n; } + +/**************************************************************************************************/ + +string TaxonomyNode::getName() { return name; } + +/**************************************************************************************************/ + +void TaxonomyNode::setParent(int p) { parent = p; } + +/**************************************************************************************************/ + +int TaxonomyNode::getParent() { return parent; } + +/**************************************************************************************************/ + +void TaxonomyNode::makeChild(string c, int i) { children[c] = i; } + + +/**************************************************************************************************/ + +map TaxonomyNode::getChildren() { return children; } + +/**************************************************************************************************/ + +int TaxonomyNode::getChildIndex(string c){ + map::iterator it = children.find(c); + if(it != children.end()) { return it->second; } + else { return -1; } +} + +/**************************************************************************************************/ + +int TaxonomyNode::getNumKids() { return (int)children.size(); } + +/**************************************************************************************************/ + +int TaxonomyNode::getNumSeqs() { return numSeqs; } + +/**************************************************************************************************/ + +void TaxonomyNode::setTotalSeqs(int n) { totalSeqs = n; } + +/**************************************************************************************************/ + +int TaxonomyNode::getLevel() { return level; } + +/**************************************************************************************************/ diff --git a/taxonomynode.h b/taxonomynode.h new file mode 100755 index 0000000..08bad3e --- /dev/null +++ b/taxonomynode.h @@ -0,0 +1,53 @@ +#ifndef TAXONOMYNODE +#define TAXONOMYNODE + +/* + * taxonomynode.h + * + * + * Created by Pat Schloss on 7/8/11. + * Copyright 2011 Patrick D. Schloss. All rights reserved. + * + */ + +/**************************************************************************************************/ + +#include "mothurout.h" +/**************************************************************************************************/ + +class TaxonomyNode { + +public: + TaxonomyNode(); + TaxonomyNode(string, int); + void setName(string); + string getName(); + + + void setParent(int); + int getParent(); + + void makeChild(string, int); + map getChildren(); + int getChildIndex(string); + int getNumKids(); + int getNumSeqs(); + void setTotalSeqs(int); + int getLevel(); + +private: + int parent; + map children; + int numChildren; + int level; + +protected: + MothurOut* m; + int numSeqs; + int totalSeqs; + string name; +}; + +/**************************************************************************************************/ + +#endif -- 2.39.2