A71CB160130B04A2001E7287 /* anosimcommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A71CB15E130B04A2001E7287 /* anosimcommand.cpp */; };
A71FE12C12EDF72400963CA7 /* mergegroupscommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A71FE12B12EDF72400963CA7 /* mergegroupscommand.cpp */; };
A721765713BB9F7D0014DAAE /* referencedb.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A721765613BB9F7D0014DAAE /* referencedb.cpp */; };
+ A721AB6A161C570F009860A1 /* alignnode.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A721AB66161C570F009860A1 /* alignnode.cpp */; };
+ A721AB6B161C570F009860A1 /* aligntree.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A721AB68161C570F009860A1 /* aligntree.cpp */; };
+ A721AB71161C572A009860A1 /* kmernode.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A721AB6D161C572A009860A1 /* kmernode.cpp */; };
+ A721AB72161C572A009860A1 /* kmertree.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A721AB6F161C572A009860A1 /* kmertree.cpp */; };
+ A721AB77161C573B009860A1 /* taxonomynode.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A721AB73161C573B009860A1 /* taxonomynode.cpp */; };
A724D2B7153C8628000A826F /* makebiomcommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A724D2B6153C8628000A826F /* makebiomcommand.cpp */; };
A727864412E9E28C00F86ABA /* removerarecommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A727864312E9E28C00F86ABA /* removerarecommand.cpp */; };
A7386C231619CCE600651424 /* classifysharedcommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7386C211619CCE600651424 /* classifysharedcommand.cpp */; };
A71FE12B12EDF72400963CA7 /* mergegroupscommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = mergegroupscommand.cpp; sourceTree = "<group>"; };
A721765513BB9F7D0014DAAE /* referencedb.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = referencedb.h; sourceTree = "<group>"; };
A721765613BB9F7D0014DAAE /* referencedb.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = referencedb.cpp; sourceTree = "<group>"; };
+ A721AB66161C570F009860A1 /* alignnode.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = alignnode.cpp; sourceTree = "<group>"; };
+ A721AB67161C570F009860A1 /* alignnode.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = alignnode.h; sourceTree = "<group>"; };
+ A721AB68161C570F009860A1 /* aligntree.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = aligntree.cpp; sourceTree = "<group>"; };
+ A721AB69161C570F009860A1 /* aligntree.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = aligntree.h; sourceTree = "<group>"; };
+ A721AB6D161C572A009860A1 /* kmernode.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = kmernode.cpp; sourceTree = "<group>"; };
+ A721AB6E161C572A009860A1 /* kmernode.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = kmernode.h; sourceTree = "<group>"; };
+ A721AB6F161C572A009860A1 /* kmertree.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = kmertree.cpp; sourceTree = "<group>"; };
+ A721AB70161C572A009860A1 /* kmertree.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = kmertree.h; sourceTree = "<group>"; };
+ A721AB73161C573B009860A1 /* taxonomynode.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = taxonomynode.cpp; sourceTree = "<group>"; };
+ A721AB74161C573B009860A1 /* taxonomynode.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = taxonomynode.h; sourceTree = "<group>"; };
A724D2B4153C8600000A826F /* makebiomcommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = makebiomcommand.h; sourceTree = "<group>"; };
A724D2B6153C8628000A826F /* makebiomcommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = makebiomcommand.cpp; sourceTree = "<group>"; };
A727864212E9E28C00F86ABA /* removerarecommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = removerarecommand.h; sourceTree = "<group>"; };
A7E9BA4B12D3966900DA6239 /* classifier */ = {
isa = PBXGroup;
children = (
- A7E9B65A12D37EC300DA6239 /* bayesian.cpp */,
+ A721AB67161C570F009860A1 /* alignnode.h */,
+ A721AB66161C570F009860A1 /* alignnode.cpp */,
+ A721AB69161C570F009860A1 /* aligntree.h */,
+ A721AB68161C570F009860A1 /* aligntree.cpp */,
A7E9B65B12D37EC300DA6239 /* bayesian.h */,
+ A7E9B65A12D37EC300DA6239 /* bayesian.cpp */,
A7E9B68E12D37EC400DA6239 /* classify.cpp */,
A7E9B68F12D37EC400DA6239 /* classify.h */,
+ A721AB6E161C572A009860A1 /* kmernode.h */,
+ A721AB6D161C572A009860A1 /* kmernode.cpp */,
+ A721AB70161C572A009860A1 /* kmertree.h */,
+ A721AB6F161C572A009860A1 /* kmertree.cpp */,
A7E9B73812D37EC400DA6239 /* knn.h */,
A7E9B73712D37EC400DA6239 /* knn.cpp */,
A7E9B78D12D37EC400DA6239 /* phylosummary.cpp */,
A7E9B79012D37EC400DA6239 /* phylotree.h */,
A7E9B85D12D37EC400DA6239 /* taxonomyequalizer.cpp */,
A7E9B85E12D37EC400DA6239 /* taxonomyequalizer.h */,
+ A721AB74161C573B009860A1 /* taxonomynode.h */,
+ A721AB73161C573B009860A1 /* taxonomynode.cpp */,
);
name = classifier;
sourceTree = "<group>";
A7386C29161A110800651424 /* decisiontree.cpp in Sources */,
A77E1938161B201E00DB1A2A /* randomforest.cpp in Sources */,
A77E193B161B289600DB1A2A /* rftreenode.cpp in Sources */,
+ A721AB6A161C570F009860A1 /* alignnode.cpp in Sources */,
+ A721AB6B161C570F009860A1 /* aligntree.cpp in Sources */,
+ A721AB71161C572A009860A1 /* kmernode.cpp in Sources */,
+ A721AB72161C572A009860A1 /* kmertree.cpp in Sources */,
+ A721AB77161C573B009860A1 /* taxonomynode.cpp in Sources */,
);
runOnlyForDeploymentPostprocessing = 0;
};
--- /dev/null
+/*
+ * alignNode.cpp
+ * bayesian
+ *
+ * Created by Pat Schloss on 10/11/11.
+ * Copyright 2011 Patrick D. Schloss. All rights reserved.
+ *
+ */
+
+#include "alignNode.h"
+#include "taxonomynode.h"
+
+#include "bayesian.h"
+
+/**************************************************************************************************/
+
+AlignNode::AlignNode(string n, int l): TaxonomyNode(n, l){
+
+ alignLength = 0;
+}
+
+/**************************************************************************************************/
+
+void AlignNode::printTheta(){
+ try {
+ m->mothurOut("A:\t"); for(int i=0;i<alignLength;i++){ m->mothurOut(toString(theta[i].A)+ '\t'); } m->mothurOutEndLine();
+ m->mothurOut("T:\t"); for(int i=0;i<alignLength;i++){ m->mothurOut(toString(theta[i].T)+ '\t'); } m->mothurOutEndLine();
+ m->mothurOut("G:\t"); for(int i=0;i<alignLength;i++){ m->mothurOut(toString(theta[i].G)+ '\t'); } m->mothurOutEndLine();
+ m->mothurOut("C:\t"); for(int i=0;i<alignLength;i++){ m->mothurOut(toString(theta[i].C)+ '\t'); } m->mothurOutEndLine();
+ m->mothurOut("I:\t"); for(int i=0;i<alignLength;i++){ m->mothurOut(toString(theta[i].gap)+ '\t'); } m->mothurOutEndLine();
+ }
+ catch(exception& e) {
+ m->errorOut(e, "AlignNode", "printTheta");
+ exit(1);
+ }
+}
+
+/**************************************************************************************************/
+
+int AlignNode::loadSequence(string& sequence){
+ try {
+ alignLength = (int)sequence.length(); // this function runs through the alignment and increments the frequency
+ // of each base for a particular taxon. we are building the thetas
+
+ if(theta.size() == 0){
+ theta.resize(alignLength);
+ columnCounts.resize(alignLength, 0);
+ }
+
+ for(int i=0;i<alignLength;i++){
+
+ if (m->control_pressed) { return 0; }
+
+ char base = sequence[i];
+
+ if(base == 'A') { theta[i].A++; columnCounts[i]++; } // our thetas will be alignLength x 5
+ else if(base == 'T'){ theta[i].T++; columnCounts[i]++; } // and we ignore any position that has
+ else if(base == 'G'){ theta[i].G++; columnCounts[i]++; } // an ambiguous base call
+ else if(base == 'C'){ theta[i].C++; columnCounts[i]++; }
+ else if(base == '-'){ theta[i].gap++; columnCounts[i]++; }
+ else if(base == 'U'){ theta[i].T++; columnCounts[i]++; }
+ }
+
+ numSeqs++;
+
+ return 0;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "AlignNode", "loadSequence");
+ exit(1);
+ }
+}
+
+/**************************************************************************************************/
+
+int AlignNode::checkTheta(){
+ try {
+ for(int i=0;i<alignLength;i++){
+
+ if (m->control_pressed) { return 0; }
+
+ if(theta[i].gap == columnCounts[i]){
+ columnCounts[i] = 0;
+ }
+ // else{
+ // int maxCount = theta[i].A;
+ //
+ // if(theta[i].T > maxCount) { maxCount = theta[i].T; }
+ // if(theta[i].G > maxCount) { maxCount = theta[i].T; }
+ // if(theta[i].C > maxCount) { maxCount = theta[i].T; }
+ // if(theta[i].gap > maxCount) { maxCount = theta[i].T; }
+ //
+ // if(maxCount < columnCounts[i] * 0.25){// || maxCount == columnCounts[i]){ //remove any column where the maximum frequency is <50%
+ // columnCounts[i] = 0;
+ // }
+ // }
+
+ }
+
+ return 0;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "AlignNode", "checkTheta");
+ exit(1);
+ }
+}
+
+/**************************************************************************************************/
+
+int AlignNode::addThetas(vector<thetaAlign> newTheta, int newNumSeqs){
+ try {
+ if(alignLength == 0){
+ alignLength = (int)newTheta.size();
+ theta.resize(alignLength);
+ columnCounts.resize(alignLength);
+ }
+
+ for(int i=0;i<alignLength;i++){
+
+ if (m->control_pressed) { return 0; }
+
+ theta[i].A += newTheta[i].A; columnCounts[i] += newTheta[i].A;
+ theta[i].T += newTheta[i].T; columnCounts[i] += newTheta[i].T;
+ theta[i].G += newTheta[i].G; columnCounts[i] += newTheta[i].G;
+ theta[i].C += newTheta[i].C; columnCounts[i] += newTheta[i].C;
+ theta[i].gap += newTheta[i].gap; columnCounts[i] += newTheta[i].gap;
+ }
+
+ numSeqs += newNumSeqs;
+
+ return 0;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "AlignNode", "addThetas");
+ exit(1);
+ }
+}
+
+/**************************************************************************************************/
+
+double AlignNode::getSimToConsensus(string& query){
+ try {
+ double similarity = 0;
+
+ int length = 0;
+
+ for(int i=0;i<alignLength;i++){
+
+ if (m->control_pressed) { return similarity; }
+
+ char base = query[i];
+
+ if(base != '.' && base != 'N' && columnCounts[i] != 0){
+
+ double fraction = 0;
+
+ if(base == 'A'){
+ fraction = (int) theta[i].A / (double) columnCounts[i];
+ similarity += fraction;
+ length++;
+ }
+ else if(base == 'T'){
+ fraction = (int) theta[i].T / (double) columnCounts[i];
+ similarity += fraction;
+ length++;
+ }
+ else if(base == 'G'){
+ fraction = (int) theta[i].G / (double) columnCounts[i];
+ similarity += fraction;
+ length++;
+ }
+ else if(base == 'C'){
+ fraction = (int) theta[i].C / (double) columnCounts[i];
+ similarity += fraction;
+ length++;
+ }
+ else if(base == '-'){
+ fraction = (int) theta[i].gap / (double) columnCounts[i];
+ similarity += fraction;
+ length++;
+ }
+ }
+ }
+
+ if(length != 0){
+ similarity /= double(length);
+ }
+ else {
+ similarity = 0;
+ }
+
+ return similarity;
+
+ }
+ catch(exception& e) {
+ m->errorOut(e, "AlignNode", "getSimToConsensus");
+ exit(1);
+ }
+}
+
+/**************************************************************************************************/
+
+double AlignNode::getPxGivenkj_D_j(string& query){ //P(x | k_j, D, j)
+ try {
+ double PxGivenkj_D_j = 0;
+
+ int count = 0;
+ double alpha = 1 / (double)totalSeqs; //flat prior
+
+
+ for(int s=0;s<alignLength;s++){
+
+ if (m->control_pressed) { return PxGivenkj_D_j; }
+
+ char base = query[s];
+ thetaAlign thetaS = theta[s];
+
+ if(base != '.' && base != 'N' && columnCounts[s] != 0){
+ double Nkj_s = (double)columnCounts[s];
+ double nkj_si = 0;
+
+
+ if(base == 'A') { nkj_si = (double)thetaS.A; }
+ else if(base == 'T'){ nkj_si = (double)thetaS.T; }
+ else if(base == 'G'){ nkj_si = (double)thetaS.G; }
+ else if(base == 'C'){ nkj_si = (double)thetaS.C; }
+ else if(base == '-'){ nkj_si = (double)thetaS.gap; }
+ else if(base == 'U'){ nkj_si = (double)thetaS.T; }
+
+ // double alpha = pow(0.2, double(Nkj_s)) + 0.0001; //need to make 1e-4 a variable in future; this is the non-flat prior
+
+ // if(columnCounts[s] != nkj_si){ //deal only with segregating sites...
+ double numerator = nkj_si + alpha;
+ double denomenator = Nkj_s + 5.0 * alpha;
+
+ PxGivenkj_D_j += log(numerator) - log(denomenator);
+ count++;
+ // }
+ }
+ if(base != '.' && columnCounts[s] == 0 && thetaS.gap == 0){
+ count = 0;
+ break;
+ }
+
+ }
+
+ if(count == 0){ PxGivenkj_D_j = -1e10; }
+
+ return PxGivenkj_D_j;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "AlignNode", "getPxGivenkj_D_j");
+ exit(1);
+ }
+}
+
+/**************************************************************************************************/
--- /dev/null
+#ifndef ALIGNNODE
+#define ALIGNNODE
+
+/*
+ * alignNode.h
+ * bayesian
+ *
+ * Created by Pat Schloss on 10/11/11.
+ * Copyright 2011 Patrick D. Schloss. All rights reserved.
+ *
+ */
+
+#include "taxonomynode.h"
+
+/**************************************************************************************************/
+
+struct thetaAlign {
+ thetaAlign() : A(0), T(0), G(0), C(0), gap(0){}
+ unsigned int A;
+ unsigned int T;
+ unsigned int G;
+ unsigned int C;
+ unsigned int gap;
+};
+
+/**************************************************************************************************/
+
+class AlignNode : public TaxonomyNode {
+
+public:
+ AlignNode(string, int);
+ int loadSequence(string&);
+ int checkTheta();
+ void printTheta();
+ double getPxGivenkj_D_j(string& query); //P(x | k_j, D, j)
+ double getSimToConsensus(string& query);
+ vector<thetaAlign> getTheta() { return theta; }
+ int addThetas(vector<thetaAlign>, int);
+
+private:
+ vector<thetaAlign> theta;
+ vector<unsigned int> columnCounts;
+ int alignLength;
+};
+
+/**************************************************************************************************/
+
+#endif
+
--- /dev/null
+//
+// alignTree.cpp
+// pdsBayesian
+//
+// Created by Patrick Schloss on 4/3/12.
+// Copyright (c) 2012 University of Michigan. All rights reserved.
+//
+
+#include "alignnode.h"
+#include "aligntree.h"
+
+/**************************************************************************************************/
+
+AlignTree::AlignTree(string referenceFileName, string taxonomyFileName, int cutoff) : Classify(), confidenceThreshold(cutoff){
+ try {
+ AlignNode* newNode = new AlignNode("Root", 0);
+ tree.push_back(newNode); // the tree is stored as a vector of elements of type TaxonomyNode
+
+ string refTaxonomy;
+
+ readTaxonomy(taxonomyFileName);
+
+ ifstream referenceFile;
+ m->openInputFile(referenceFileName, referenceFile);
+ bool error = false;
+ map<int, int> lengths;
+ while(!referenceFile.eof()){
+
+ if (m->control_pressed) { break; }
+
+ Sequence seq(referenceFile); m->gobble(referenceFile);
+
+ if (seq.getName() != "") {
+ map<string, string>::iterator it = taxonomy.find(seq.getName());
+
+ if (it != taxonomy.end()) {
+ refTaxonomy = it->second; // lookup the taxonomy string for the current reference sequence
+ string aligned = seq.getAligned();
+ lengths[aligned.length()] = 1;
+ if (lengths.size() > 1) { error = true; m->mothurOut("[ERROR]: reference sequences must be aligned to use the align method, quitting.\n"); break; }
+ addTaxonomyToTree(seq.getName(), refTaxonomy, aligned);
+ }else {
+ m->mothurOut(seq.getName() + " is in your reference file, but not in your taxonomy file, please correct.\n"); error = true;
+ }
+ }
+ }
+ referenceFile.close();
+
+ length = (lengths.begin())->first;
+
+ if (error) { m->control_pressed = true; }
+
+ numTaxa = (int)tree.size();
+
+ numLevels = 0;
+ for(int i=0;i<numTaxa;i++){
+ int level = tree[i]->getLevel();
+ if(level > numLevels){ numLevels = level; }
+ }
+ numLevels++;
+
+ aggregateThetas();
+
+ int dbSize = tree[0]->getNumSeqs();
+
+ for(int i=0;i<numTaxa;i++){
+ tree[i]->checkTheta();
+ tree[i]->setTotalSeqs(dbSize);
+ }
+
+ }
+ catch(exception& e) {
+ m->errorOut(e, "AlignTree", "AlignTree");
+ exit(1);
+ }
+}
+
+/**************************************************************************************************/
+
+AlignTree::~AlignTree(){
+ try {
+ for(int i=0;i<tree.size();i++){
+ delete tree[i];
+ }
+ }
+ catch(exception& e) {
+ m->errorOut(e, "AlignTree", "~AlignTree");
+ exit(1);
+ }
+}
+
+/**************************************************************************************************/
+
+int AlignTree::addTaxonomyToTree(string seqName, string& taxonomy, string& sequence){
+ try {
+ AlignNode* newNode;
+ string taxonName = "";
+ int treePosition = 0; // the root is element 0
+
+ int level = 1;
+
+ for(int i=0;i<taxonomy.length();i++){ // step through taxonomy string...
+
+ if (m->control_pressed) { break; }
+
+ if(taxonomy[i] == ';'){ // looking for semicolons...
+
+ if (taxonName == "") { m->mothurOut(seqName + " has an error in the taxonomy. This may be due to a ;;"); m->mothurOutEndLine(); m->control_pressed = true; }
+
+ int newIndex = tree[treePosition]->getChildIndex(taxonName); // look to see if your current node already
+ // has a child with the new taxonName
+ if(newIndex != -1) { treePosition = newIndex; } // if you've seen it before, jump to that
+ else { // position in the tree
+ int newChildIndex = (int)tree.size(); // otherwise, we'll have to create one...
+ tree[treePosition]->makeChild(taxonName, newChildIndex);
+
+ newNode = new AlignNode(taxonName, level);
+
+ newNode->setParent(treePosition);
+
+ tree.push_back(newNode);
+ treePosition = newChildIndex;
+ }
+
+ // sequence data to that node to update that node's theta - seems slow...
+ taxonName = ""; // clear out the taxon name that we will build as we look
+ level++;
+ } // for a semicolon
+ else{
+ taxonName += taxonomy[i]; // keep adding letters until we reach a semicolon
+ }
+ }
+ tree[treePosition]->loadSequence(sequence); // now that we've gotten to the correct node, add the
+
+ return 0;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "AlignTree", "addTaxonomyToTree");
+ exit(1);
+ }
+}
+
+/**************************************************************************************************/
+
+int AlignTree::aggregateThetas(){
+ try {
+ vector<vector<int> > levelMatrix(numLevels+1);
+
+ for(int i=0;i<tree.size();i++){
+ if (m->control_pressed) { return 0; }
+ levelMatrix[tree[i]->getLevel()].push_back(i);
+ }
+
+ for(int i=numLevels-1;i>0;i--){
+ if (m->control_pressed) { return 0; }
+ for(int j=0;j<levelMatrix[i].size();j++){
+
+ AlignNode* holder = tree[levelMatrix[i][j]];
+
+ tree[holder->getParent()]->addThetas(holder->getTheta(), holder->getNumSeqs());
+ }
+ }
+ return 0;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "AlignTree", "aggregateThetas");
+ exit(1);
+ }
+}
+
+/**************************************************************************************************/
+
+double AlignTree::getOutlierLogProbability(string& sequence){
+ try {
+ double count = 0;
+
+ for(int i=0;i<sequence.length();i++){
+
+ if(sequence[i] != '.'){ count++; }
+
+ }
+
+ return count * log(0.2);
+ }
+ catch(exception& e) {
+ m->errorOut(e, "AlignTree", "getOutlierLogProbability");
+ exit(1);
+ }
+}
+
+/**************************************************************************************************/
+
+int AlignTree::getMinRiskIndexAlign(string& sequence, vector<int>& taxaIndices, vector<double>& probabilities){
+ try {
+ int numProbs = (int)probabilities.size();
+
+ vector<double> G(numProbs, 0.2); //a random sequence will, on average, be 20% similar to any other sequence
+ vector<double> risk(numProbs, 0);
+
+ for(int i=1;i<numProbs;i++){ //use if you want the outlier group
+ if (m->control_pressed) { return 0; }
+ G[i] = tree[taxaIndices[i]]->getSimToConsensus(sequence);
+ }
+
+ double minRisk = 1e6;
+ int minRiskIndex = 0;
+
+ for(int i=0;i<numProbs;i++){
+ if (m->control_pressed) { return 0; }
+ for(int j=0;j<numProbs;j++){
+ if(i != j){
+ risk[i] += probabilities[j] * G[j];
+ }
+ }
+
+ if(risk[i] < minRisk){
+ minRisk = risk[i];
+ minRiskIndex = i;
+ }
+ }
+
+ return minRiskIndex;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "AlignTree", "getMinRiskIndexAlign");
+ exit(1);
+ }
+
+}
+
+/**************************************************************************************************/
+
+int AlignTree::sanityCheck(vector<vector<int> >& indices, vector<int>& maxIndices){
+ try {
+ int finalLevel = (int)indices.size()-1;
+
+ for(int position=1;position<indices.size();position++){
+ if (m->control_pressed) { return 0; }
+ int predictedParent = tree[indices[position][maxIndices[position]]]->getParent();
+ int actualParent = indices[position-1][maxIndices[position-1]];
+
+ if(predictedParent != actualParent){
+ finalLevel = position - 1;
+ return finalLevel;
+ }
+ }
+ return finalLevel;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "AlignTree", "sanityCheck");
+ exit(1);
+ }
+}
+
+/**************************************************************************************************/
+
+string AlignTree::getTaxonomy(Sequence* seq){
+ try {
+ string seqName = seq->getName(); string querySequence = seq->getAligned(); string taxonProbabilityString = "";
+ if (querySequence.length() != length) {
+ m->mothurOut("[ERROR]: " + seq->getName() + " has length " + toString(querySequence.length()) + ", reference sequences length is " + toString(length) + ". Are your sequences aligned? Sequences must be aligned to use the align search method.\n"); m->control_pressed = true; return "";
+ }
+ double logPOutlier = getOutlierLogProbability(querySequence);
+
+ vector<vector<double> > pXgivenKj_D_j(numLevels);
+ vector<vector<int> > indices(numLevels);
+ for(int i=0;i<numLevels;i++){
+ if (m->control_pressed) { return taxonProbabilityString; }
+ pXgivenKj_D_j[i].push_back(logPOutlier);
+ indices[i].push_back(-1);
+ }
+
+
+ for(int i=0;i<numTaxa;i++){
+ // cout << i << '\t' << tree[i]->getName() << '\t' << tree[i]->getLevel() << '\t' << tree[i]->getPxGivenkj_D_j(querySequence) << endl;
+ if (m->control_pressed) { return taxonProbabilityString; }
+ pXgivenKj_D_j[tree[i]->getLevel()].push_back(tree[i]->getPxGivenkj_D_j(querySequence));
+ indices[tree[i]->getLevel()].push_back(i);
+ }
+
+ vector<double> sumLikelihood(numLevels, 0);
+ vector<double> bestPosterior(numLevels, 0);
+ vector<int> maxIndex(numLevels, 0);
+ int maxPosteriorIndex;
+
+
+ //cout << "before best level" << endl;
+
+ //let's find the best level and taxa within that level
+ for(int i=0;i<numLevels;i++){ //go across all j's - from the root to genus
+ if (m->control_pressed) { return taxonProbabilityString; }
+ int numTaxaInLevel = (int)indices[i].size();
+
+ //cout << "numTaxaInLevel:\t" << numTaxaInLevel << endl;
+
+ vector<double> posteriors(numTaxaInLevel, 0);
+ sumLikelihood[i] = getLogExpSum(pXgivenKj_D_j[i], maxPosteriorIndex);
+
+ maxPosteriorIndex = 0;
+ for(int j=0;j<numTaxaInLevel;j++){
+ posteriors[j] = exp(pXgivenKj_D_j[i][j] - sumLikelihood[i]);
+
+ if(posteriors[j] > posteriors[maxPosteriorIndex]){
+ maxPosteriorIndex = j;
+ }
+
+ }
+
+ maxIndex[i] = getMinRiskIndexAlign(querySequence, indices[i], posteriors);
+
+ maxIndex[i] = maxPosteriorIndex;
+ bestPosterior[i] = posteriors[maxIndex[i]];
+ }
+
+ // vector<double> pX_level(numLevels, 0);
+ //
+ // for(int i=0;i<numLevels;i++){
+ // pX_level[i] = pXgivenKj_D_j[i][maxIndex[i]] - tree[indices[i][maxIndex[i]]]->getNumSeqs();
+ // }
+ //
+ // int max_pLevel_X_index = -1;
+ // double pX_level_sum = getLogExpSum(pX_level, max_pLevel_X_index);
+ // double max_pLevel_X = exp(pX_level[max_pLevel_X_index] - pX_level_sum);
+ //
+ // vector<double> pLevel_X(numLevels, 0);
+ // for(int i=0;i<numLevels;i++){
+ // pLevel_X[i] = exp(pX_level[i] - pX_level_sum);
+ // }
+
+
+
+
+ int saneDepth = sanityCheck(indices, maxIndex);
+
+ simpleTax = "";
+ int savedspot = 1;
+ taxonProbabilityString = "";
+ for(int i=1;i<=saneDepth;i++){
+ if (m->control_pressed) { return taxonProbabilityString; }
+ int confidenceScore = (int) (bestPosterior[i] * 100);
+ if (confidenceScore >= confidenceThreshold) {
+ if(indices[i][maxIndex[i]] != -1){
+ taxonProbabilityString += tree[indices[i][maxIndex[i]]]->getName() + '(' + toString(confidenceScore) + ");";
+ simpleTax += tree[indices[i][maxIndex[i]]]->getName() + ";";
+ // levelProbabilityOutput << tree[indices[i][maxIndex[i]]]->getName() << '(' << setprecision(6) << pLevel_X[i] << ");";
+ }
+ else{
+ taxonProbabilityString + "unclassified" + '(' + toString(confidenceScore) + ");";
+ // levelProbabilityOutput << "unclassified" << '(' << setprecision(6) << pLevel_X[i] << ");";
+ simpleTax += "unclassified;";
+ }
+ }else { break; }
+ savedspot = i;
+ }
+
+ for(int i=savedspot+1;i<numLevels;i++){
+ if (m->control_pressed) { return taxonProbabilityString; }
+ taxonProbabilityString + "unclassified(0);";
+ simpleTax += "unclassified;";
+ }
+
+ return taxonProbabilityString;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "AlignTree", "getTaxonomy");
+ exit(1);
+ }
+}
+
+
+/**************************************************************************************************/
--- /dev/null
+//
+// alignTree.h
+// pdsBayesian
+//
+// Created by Patrick Schloss on 4/3/12.
+// Copyright (c) 2012 University of Michigan. All rights reserved.
+//
+
+#ifndef pdsBayesian_alignTree_h
+#define pdsBayesian_alignTree_h
+
+#include "classify.h"
+
+class AlignNode;
+
+class AlignTree : public Classify {
+
+public:
+ AlignTree(string, string, int);
+ ~AlignTree();
+ string getTaxonomy(Sequence*);
+
+private:
+ int addTaxonomyToTree(string, string&, string&);
+ double getOutlierLogProbability(string&);
+ int getMinRiskIndexAlign(string&, vector<int>&, vector<double>&);
+ int aggregateThetas();
+ int sanityCheck(vector<vector<int> >&, vector<int>&);
+
+ int numSeqs, confidenceThreshold, length;
+ vector<AlignNode*> tree;
+};
+
+#endif
}
/**************************************************************************************************/
+double Classify::getLogExpSum(vector<double> probabilities, int& maxIndex){
+ try {
+ // http://jblevins.org/notes/log-sum-exp
+
+ double maxProb = probabilities[0];
+ maxIndex = 0;
+
+ int numProbs = (int)probabilities.size();
+
+ for(int i=1;i<numProbs;i++){
+ if(probabilities[i] >= maxProb){
+ maxProb = probabilities[i];
+ maxIndex = i;
+ }
+ }
+
+ double probSum = 0.0000;
+
+ for(int i=0;i<numProbs;i++){
+ probSum += exp(probabilities[i] - maxProb);
+ }
+
+ probSum = log(probSum) + maxProb;
+
+ return probSum;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "Classify", "getLogExpSum");
+ exit(1);
+ }
+}
+
+/**************************************************************************************************/
+
#include "database.hpp"
#include "phylotree.h"
-
class Sequence;
-
/**************************************************************************************************/
class Classify {
protected:
map<string, string> taxonomy; //name maps to taxonomy
- //map<string, int> genusCount; //maps genus to count - in essence a list of how many seqs are in each taxonomy
map<string, int>::iterator itTax;
map<string, string>::iterator it;
Database* database;
string taxFile, templateFile, simpleTax;
vector<string> names;
- int threadID;
+ int threadID, numLevels, numTaxa;
bool flip, flipped, shortcuts;
int readTaxonomy(string);
vector<string> parseTax(string);
+ double getLogExpSum(vector<double>, int&);
MothurOut* m;
};
CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "none", "none",false,false); parameters.push_back(pcount);
CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "none", "none",false,false); parameters.push_back(pgroup);
- CommandParameter psearch("search", "Multiple", "kmer-blast-suffix-distance", "kmer", "", "", "",false,false); parameters.push_back(psearch);
+ CommandParameter psearch("search", "Multiple", "kmer-blast-suffix-distance-align", "kmer", "", "", "",false,false); parameters.push_back(psearch);
CommandParameter pksize("ksize", "Number", "", "8", "", "", "",false,false); parameters.push_back(pksize);
- CommandParameter pmethod("method", "Multiple", "bayesian-knn", "bayesian", "", "", "",false,false); parameters.push_back(pmethod);
+ CommandParameter pmethod("method", "Multiple", "wang-knn-zap", "wang", "", "", "",false,false); parameters.push_back(pmethod);
CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
CommandParameter pmatch("match", "Number", "", "1.0", "", "", "",false,false); parameters.push_back(pmatch);
CommandParameter pmismatch("mismatch", "Number", "", "-1.0", "", "", "",false,false); parameters.push_back(pmismatch);
helpString += "The classify.seqs command reads a fasta file containing sequences and creates a .taxonomy file and a .tax.summary file.\n";
helpString += "The classify.seqs command parameters are reference, fasta, name, group, count, search, ksize, method, taxonomy, processors, match, mismatch, gapopen, gapextend, numwanted and probs.\n";
helpString += "The reference, fasta and taxonomy parameters are required. You may enter multiple fasta files by separating their names with dashes. ie. fasta=abrecovery.fasta-amzon.fasta \n";
- helpString += "The search parameter allows you to specify the method to find most similar template. Your options are: suffix, kmer, blast and distance. The default is kmer.\n";
+ helpString += "The search parameter allows you to specify the method to find most similar template. Your options are: suffix, kmer, blast, align and distance. The default is kmer.\n";
helpString += "The name parameter allows you add a names file with your fasta file, if you enter multiple fasta files, you must enter matching names files for them.\n";
helpString += "The group parameter allows you add a group file so you can have the summary totals broken up by group.\n";
helpString += "The count parameter allows you add a count file so you can have the summary totals broken up by group.\n";
- helpString += "The method parameter allows you to specify classification method to use. Your options are: bayesian and knn. The default is bayesian.\n";
+ helpString += "The method parameter allows you to specify classification method to use. Your options are: wang, knn and zap. The default is wang.\n";
helpString += "The ksize parameter allows you to specify the kmer size for finding most similar template to candidate. The default is 8.\n";
helpString += "The processors parameter allows you to specify the number of processors to use. The default is 1.\n";
#ifdef USE_MPI
helpString += "The gapextend parameter allows you to specify the penalty for extending a gap in an alignment. The default is -1.0.\n";
helpString += "The numwanted parameter allows you to specify the number of sequence matches you want with the knn method. The default is 10.\n";
helpString += "The cutoff parameter allows you to specify a bootstrap confidence threshold for your taxonomy. The default is 0.\n";
- helpString += "The probs parameter shuts off the bootstrapping results for the bayesian method. The default is true, meaning you want the bootstrapping to be shown.\n";
- helpString += "The iters parameter allows you to specify how many iterations to do when calculating the bootstrap confidence score for your taxonomy with the bayesian method. The default is 100.\n";
+ helpString += "The probs parameter shuts off the bootstrapping results for the wang and zap method. The default is true, meaning you want the bootstrapping to be shown.\n";
+ helpString += "The iters parameter allows you to specify how many iterations to do when calculating the bootstrap confidence score for your taxonomy with the wang method. The default is 100.\n";
//helpString += "The flip parameter allows you shut off mothur's The default is T.\n";
helpString += "The classify.seqs command should be in the following format: \n";
helpString += "classify.seqs(reference=yourTemplateFile, fasta=yourFastaFile, method=yourClassificationMethod, search=yourSearchmethod, ksize=yourKmerSize, taxonomy=yourTaxonomyFile, processors=yourProcessors) \n";
//check for optional parameter and set defaults
// ...at some point should added some additional type checking...
string temp;
- temp = validParameter.validFile(parameters, "ksize", false); if (temp == "not found"){ temp = "8"; }
- m->mothurConvert(temp, kmerSize);
-
temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); }
m->setProcessors(temp);
m->mothurConvert(temp, processors);
search = validParameter.validFile(parameters, "search", false); if (search == "not found"){ search = "kmer"; }
- method = validParameter.validFile(parameters, "method", false); if (method == "not found"){ method = "bayesian"; }
+ method = validParameter.validFile(parameters, "method", false); if (method == "not found"){ method = "wang"; }
+
+ temp = validParameter.validFile(parameters, "ksize", false); if (temp == "not found"){
+ temp = "8";
+ if (method == "zap") { temp = "7"; }
+ }
+ m->mothurConvert(temp, kmerSize);
temp = validParameter.validFile(parameters, "match", false); if (temp == "not found"){ temp = "1.0"; }
m->mothurConvert(temp, match);
m->mothurConvert(temp, iters);
- if ((method == "bayesian") && (search != "kmer")) {
- m->mothurOut("The bayesian method requires the kmer search." + search + "will be disregarded." ); m->mothurOutEndLine();
+ if ((method == "wang") && (search != "kmer")) {
+ m->mothurOut("The wang method requires the kmer search. " + search + " will be disregarded, and kmer will be used." ); m->mothurOutEndLine();
+ search = "kmer";
+ }
+
+ if ((method == "zap") && ((search != "kmer") && (search != "align"))) {
+ m->mothurOut("The zap method requires the kmer or align search. " + search + " will be disregarded, and kmer will be used." ); m->mothurOutEndLine();
search = "kmer";
}
try {
if (abort == true) { if (calledHelp) { return 0; } return 2; }
- if(method == "bayesian"){ classify = new Bayesian(taxonomyFileName, templateFileName, search, kmerSize, cutoff, iters, rand(), flip, writeShortcuts); }
+ string outputMethodTag = method + ".";
+ if(method == "wang"){ classify = new Bayesian(taxonomyFileName, templateFileName, search, kmerSize, cutoff, iters, rand(), flip, writeShortcuts); }
else if(method == "knn"){ classify = new Knn(taxonomyFileName, templateFileName, search, kmerSize, gapOpen, gapExtend, match, misMatch, numWanted, rand()); }
+ else if(method == "zap"){
+ outputMethodTag = search + "_" + outputMethodTag;
+ if (search == "kmer") { classify = new KmerTree(templateFileName, taxonomyFileName, kmerSize, cutoff); }
+ else { classify = new AlignTree(templateFileName, taxonomyFileName, cutoff); }
+ }
else {
- m->mothurOut(search + " is not a valid method option. I will run the command using bayesian.");
+ m->mothurOut(search + " is not a valid method option. I will run the command using wang.");
m->mothurOutEndLine();
classify = new Bayesian(taxonomyFileName, templateFileName, search, kmerSize, cutoff, iters, rand(), flip, writeShortcuts);
}
if (RippedTaxName != "") { RippedTaxName += "."; }
if (outputDir == "") { outputDir += m->hasPath(fastaFileNames[s]); }
- string newTaxonomyFile = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])) + RippedTaxName + getOutputFileNameTag("taxonomy");
- string newaccnosFile = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])) + RippedTaxName + getOutputFileNameTag("accnos");
+ string newTaxonomyFile = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])) + RippedTaxName + outputMethodTag + getOutputFileNameTag("taxonomy");
+ string newaccnosFile = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])) + RippedTaxName + outputMethodTag +getOutputFileNameTag("accnos");
string tempTaxonomyFile = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])) + "taxonomy.temp";
- string taxSummary = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])) + RippedTaxName + getOutputFileNameTag("taxsummary");
+ string taxSummary = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])) + RippedTaxName + outputMethodTag + getOutputFileNameTag("taxsummary");
if ((method == "knn") && (search == "distance")) {
string DistName = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])) + getOutputFileNameTag("matchdist");
if (hasCount) {
ct = new CountTable();
ct->readTable(countfileNames[s]);
- taxaSum = new PhyloSummary(baseTName, ct);
+ taxaSum = new PhyloSummary(taxonomyFileName, ct);
taxaSum->summarize(tempTaxonomyFile);
}else {
if (groupfile != "") { group = groupfileNames[s]; groupMap = new GroupMap(group); groupMap->readMap(); }
- taxaSum = new PhyloSummary(baseTName, groupMap);
+ taxaSum = new PhyloSummary(taxonomyFileName, groupMap);
if (m->control_pressed) { outputTypes.clear(); if (ct != NULL) { delete ct; } if (groupMap != NULL) { delete groupMap; } delete taxaSum; for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } delete classify; return 0; }
#include "phylotree.h"
#include "phylosummary.h"
#include "knn.h"
+#include "kmertree.h"
+#include "aligntree.h"
-//KNN and Bayesian methods modeled from algorithms in
+//KNN and Wang methods modeled from algorithms in
//Naı¨ve Bayesian Classifier for Rapid Assignment of rRNA Sequences
//into the New Bacterial Taxonomy†
//Qiong Wang,1 George M. Garrity,1,2 James M. Tiedje,1,2 and James R. Cole1*
Classify* myclassify;
if(pDataArray->method == "bayesian"){ myclassify = new Bayesian(pDataArray->taxonomyFileName, pDataArray->templateFileName, pDataArray->search, pDataArray->kmerSize, pDataArray->cutoff, pDataArray->iters, pDataArray->threadID, pDataArray->flip, pDataArray->writeShortcuts); }
else if(pDataArray->method == "knn"){ myclassify = new Knn(pDataArray->taxonomyFileName, pDataArray->templateFileName, pDataArray->search, pDataArray->kmerSize, pDataArray->gapOpen, pDataArray->gapExtend, pDataArray->match, pDataArray->misMatch, pDataArray->numWanted, pDataArray->threadID); }
+ else if(pDataArray->method == "zap"){
+ outputMethodTag = search + "_" + outputMethodTag;
+ if (pDataArray->search == "kmer") { myclassify = new KmerTree(pDataArray->templateFileName, pDataArray->taxonomyFileName, pDataArray->kmerSize, pDataArray->cutoff); }
+ else { myclassify = new AlignTree(pDataArray->templateFileName, pDataArray->taxonomyFileName, pDataArray->cutoff); }
+ }
else {
pDataArray->m->mothurOut(pDataArray->search + " is not a valid method option. I will run the command using bayesian.");
pDataArray->m->mothurOutEndLine();
--- /dev/null
+/*
+ * kmerNode.cpp
+ * bayesian
+ *
+ * Created by Pat Schloss on 10/11/11.
+ * Copyright 2011 Patrick D. Schloss. All rights reserved.
+ *
+ */
+
+#include "kmernode.h"
+
+
+/**********************************************************************************************************************/
+
+KmerNode::KmerNode(string s, int l, int n) : TaxonomyNode(s, l), kmerSize(n) {
+ try {
+ int power4s[14] = { 1, 4, 16, 64, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304, 16777216, 67108864 };
+
+ numPossibleKmers = power4s[kmerSize];
+ numUniqueKmers = 0;
+
+ kmerVector.assign(numPossibleKmers, 0);
+ }
+ catch(exception& e) {
+ m->errorOut(e, "KmerNode", "KmerNode");
+ exit(1);
+ }
+}
+
+/**********************************************************************************************************************/
+
+void KmerNode::loadSequence(vector<int>& kmerProfile){
+ try {
+ for(int i=0;i<numPossibleKmers;i++){
+ if (m->control_pressed) { break; }
+ if(kmerVector[i] == 0 && kmerProfile[i] != 0) { numUniqueKmers++; }
+
+ kmerVector[i] += kmerProfile[i];
+ }
+
+ numSeqs++;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "KmerNode", "loadSequence");
+ exit(1);
+ }
+}
+
+/**********************************************************************************************************************/
+
+string KmerNode::getKmerBases(int kmerNumber){
+ try {
+ // Here we convert the kmer number into the kmer in terms of bases.
+ //
+ // Example: Score = 915 (for a 6-mer)
+ // Base6 = (915 / 4^0) % 4 = 915 % 4 = 3 => T [T]
+ // Base5 = (915 / 4^1) % 4 = 228 % 4 = 0 => A [AT]
+ // Base4 = (915 / 4^2) % 4 = 57 % 4 = 1 => C [CAT]
+ // Base3 = (915 / 4^3) % 4 = 14 % 4 = 2 => G [GCAT]
+ // Base2 = (915 / 4^4) % 4 = 3 % 4 = 3 => T [TGCAT]
+ // Base1 = (915 / 4^5) % 4 = 0 % 4 = 0 => A [ATGCAT] -> this checks out with the previous method
+
+ int power4s[14] = { 1, 4, 16, 64, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304, 16777216, 67108864 };
+
+ string kmer = "";
+
+ if(kmerNumber == power4s[kmerSize]){//pow(4.,7)){ // if the kmer number is the same as the maxKmer then it must
+ for(int i=0;i<kmerSize;i++){ // have had an N in it and so we'll just call it N x kmerSize
+ kmer += 'N';
+ }
+ }
+ else{
+ for(int i=0;i<kmerSize;i++){
+ if (m->control_pressed) { return kmer; }
+ int nt = (int)(kmerNumber / (float)power4s[i]) % 4; // the '%' operator returns the remainder
+ if(nt == 0) { kmer = 'A' + kmer; } // from int-based division ]
+ else if(nt == 1){ kmer = 'C' + kmer; }
+ else if(nt == 2){ kmer = 'G' + kmer; }
+ else if(nt == 3){ kmer = 'T' + kmer; }
+ }
+ }
+ return kmer;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "KmerNode", "getKmerBases");
+ exit(1);
+ }
+}
+
+/**************************************************************************************************/
+
+void KmerNode::addThetas(vector<int> newTheta, int newNumSeqs){
+ try {
+ for(int i=0;i<numPossibleKmers;i++){
+ if (m->control_pressed) { break; }
+ kmerVector[i] += newTheta[i];
+ }
+
+ // if(alignLength == 0){
+ // alignLength = (int)newTheta.size();
+ // theta.resize(alignLength);
+ // columnCounts.resize(alignLength);
+ // }
+ //
+ // for(int i=0;i<alignLength;i++){
+ // theta[i].A += newTheta[i].A; columnCounts[i] += newTheta[i].A;
+ // theta[i].T += newTheta[i].T; columnCounts[i] += newTheta[i].T;
+ // theta[i].G += newTheta[i].G; columnCounts[i] += newTheta[i].G;
+ // theta[i].C += newTheta[i].C; columnCounts[i] += newTheta[i].C;
+ // theta[i].gap += newTheta[i].gap; columnCounts[i] += newTheta[i].gap;
+ // }
+
+ numSeqs += newNumSeqs;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "KmerNode", "addThetas");
+ exit(1);
+ }
+}
+
+/**********************************************************************************************************************/
+
+int KmerNode::getNumUniqueKmers(){
+ try {
+ if(numUniqueKmers == 0){
+
+ for(int i=0;i<numPossibleKmers;i++){
+ if (m->control_pressed) { return numUniqueKmers; }
+ if(kmerVector[i] != 0){
+ numUniqueKmers++;
+ }
+
+ }
+
+ }
+
+ return numUniqueKmers;
+
+ }
+ catch(exception& e) {
+ m->errorOut(e, "KmerNode", "getNumUniqueKmers");
+ exit(1);
+ }
+}
+
+/**********************************************************************************************************************/
+
+void KmerNode::printTheta(){
+ try {
+ m->mothurOut(name + "\n");
+ for(int i=0;i<numPossibleKmers;i++){
+ if(kmerVector[i] != 0){
+ m->mothurOut(getKmerBases(i) + '\t' + toString(kmerVector[i]) + "\n");
+ }
+ }
+ m->mothurOutEndLine();
+ }
+ catch(exception& e) {
+ m->errorOut(e, "KmerNode", "printTheta");
+ exit(1);
+ }
+
+}
+/**************************************************************************************************/
+
+double KmerNode::getSimToConsensus(vector<int>& queryKmerProfile){
+ try {
+ double present = 0;
+
+ for(int i=0;i<numPossibleKmers;i++){
+ if (m->control_pressed) { return present; }
+ if(queryKmerProfile[i] != 0 && kmerVector[i] != 0){
+ present++;
+ }
+ }
+
+ return present / double(queryKmerProfile.size() - kmerSize + 1);
+ }
+ catch(exception& e) {
+ m->errorOut(e, "KmerNode", "getSimToConsensus");
+ exit(1);
+ }
+}
+
+/**********************************************************************************************************************/
+
+double KmerNode::getPxGivenkj_D_j(vector<int>& queryKmerProfile) {
+ try {
+ double sumLogProb = 0.0000;
+ double alpha = 1.0 / (double)totalSeqs; //flat prior
+ // double alpha = pow((1.0 / (double)numUniqueKmers), numSeqs)+0.0001; //non-flat prior
+
+ for(int i=0;i<numPossibleKmers;i++){
+ if (m->control_pressed) { return sumLogProb; }
+ if(queryKmerProfile[i] != 0){ //numUniqueKmers needs to be the value from Root;
+ sumLogProb += log((kmerVector[i] + alpha) / (numSeqs + numUniqueKmers * alpha));
+ }
+
+ }
+ return sumLogProb;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "KmerNode", "getPxGivenkj_D_j");
+ exit(1);
+ }
+
+}
+
+/**********************************************************************************************************************/
--- /dev/null
+#ifndef KMERNODE
+#define KMERNODE
+
+/*
+ * kmerNode.h
+ * bayesian
+ *
+ * Created by Pat Schloss on 10/11/11.
+ * Copyright 2011 Patrick D. Schloss. All rights reserved.
+ *
+ */
+
+
+#include "taxonomynode.h"
+
+/**********************************************************************************************************************/
+
+class KmerNode : public TaxonomyNode {
+
+public:
+ KmerNode(string, int, int);
+ void loadSequence(vector<int>&);
+ void printTheta();
+ double getPxGivenkj_D_j(vector<int>&);
+ double getSimToConsensus(vector<int>&);
+ void checkTheta(){};
+ void setNumUniqueKmers(int num) { numUniqueKmers = num; }
+ int getNumUniqueKmers();
+ void addThetas(vector<int>, int);
+ vector<int> getTheta() { return kmerVector; }
+
+
+private:
+ string getKmerBases(int);
+ int kmerSize; // value of k
+ int numPossibleKmers; // 4^kmerSize
+ int numUniqueKmers; // number of unique kmers seen in a group ~ O_kj
+ int numKmers; // number of kmers in a sequence
+ vector<int> kmerVector; // counts of kmers across all sequences in a node
+};
+
+/**********************************************************************************************************************/
+
+#endif
+
--- /dev/null
+//
+// kmerTree.cpp
+// pdsBayesian
+//
+// Created by Patrick Schloss on 4/3/12.
+// Copyright (c) 2012 University of Michigan. All rights reserved.
+//
+
+#include "kmernode.h"
+#include "kmertree.h"
+
+/**************************************************************************************************/
+
+KmerTree::KmerTree(string referenceFileName, string taxonomyFileName, int k, int cutoff) : Classify(), confidenceThreshold(cutoff), kmerSize(k){
+ try {
+ KmerNode* newNode = new KmerNode("Root", 0, kmerSize);
+ tree.push_back(newNode); // the tree is stored as a vector of elements of type TaxonomyNode
+
+ int power4s[14] = { 1, 4, 16, 64, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304, 16777216, 67108864 };
+ numPossibleKmers = power4s[kmerSize];
+
+ string refTaxonomy;
+
+ readTaxonomy(taxonomyFileName);
+
+ ifstream referenceFile;
+ m->openInputFile(referenceFileName, referenceFile);
+ bool error = false;
+ while(!referenceFile.eof()){
+
+ if (m->control_pressed) { break; }
+
+ Sequence seq(referenceFile); m->gobble(referenceFile);
+
+ if (seq.getName() != "") {
+ map<string, string>::iterator it = taxonomy.find(seq.getName());
+
+ if (it != taxonomy.end()) {
+ refTaxonomy = it->second; // lookup the taxonomy string for the current reference sequence
+ vector<int> kmerProfile = ripKmerProfile(seq.getUnaligned()); //convert to kmer vector
+ addTaxonomyToTree(seq.getName(), refTaxonomy, kmerProfile);
+ }else {
+ m->mothurOut(seq.getName() + " is in your reference file, but not in your taxonomy file, please correct.\n"); error = true;
+ }
+ }
+ }
+ referenceFile.close();
+
+ if (error) { m->control_pressed = true; }
+
+ numTaxa = (int)tree.size();
+ numLevels = 0;
+ for(int i=0;i<numTaxa;i++){
+ int level = tree[i]->getLevel();
+ if(level > numLevels){ numLevels = level; }
+ }
+ numLevels++;
+
+ aggregateThetas();
+
+ int dbSize = tree[0]->getNumSeqs();
+
+ for(int i=0;i<numTaxa;i++){
+ tree[i]->checkTheta();
+ tree[i]->setNumUniqueKmers(tree[0]->getNumUniqueKmers());
+ tree[i]->setTotalSeqs(dbSize);
+ }
+ }
+ catch(exception& e) {
+ m->errorOut(e, "KmerTree", "KmerTree");
+ exit(1);
+ }
+}
+
+/**************************************************************************************************/
+
+KmerTree::~KmerTree(){
+
+ for(int i=0;i<tree.size();i++){
+ delete tree[i];
+ }
+
+}
+/**********************************************************************************************************************/
+
+vector<int> KmerTree::ripKmerProfile(string sequence){
+ try {
+ // assume all input sequences are unaligned
+
+ int power4s[14] = { 1, 4, 16, 64, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304, 16777216, 67108864 };
+
+ int nKmers = (int)sequence.length() - kmerSize + 1;
+
+ vector<int> kmerProfile(numPossibleKmers + 1, 0);
+
+ for(int i=0;i<nKmers;i++){
+
+ if (m->control_pressed) { break; }
+
+ int kmer = 0;
+ for(int j=0;j<kmerSize;j++){
+ if(toupper(sequence[j+i]) == 'A') { kmer += (0 * power4s[kmerSize-j-1]); }
+ else if(toupper(sequence[j+i]) == 'C') { kmer += (1 * power4s[kmerSize-j-1]); }
+ else if(toupper(sequence[j+i]) == 'G') { kmer += (2 * power4s[kmerSize-j-1]); }
+ else if(toupper(sequence[j+i]) == 'U') { kmer += (3 * power4s[kmerSize-j-1]); }
+ else if(toupper(sequence[j+i]) == 'T') { kmer += (3 * power4s[kmerSize-j-1]); }
+ else { kmer = power4s[kmerSize]; j = kmerSize; }
+ }
+ kmerProfile[kmer] = 1;
+ }
+
+ return kmerProfile;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "KmerTree", "ripKmerProfile");
+ exit(1);
+ }
+}
+
+/**************************************************************************************************/
+
+int KmerTree::addTaxonomyToTree(string seqName, string taxonomy, vector<int>& sequence){
+ try {
+ KmerNode* newNode;
+ string taxonName = "";
+ int treePosition = 0; // the root is element 0
+
+
+ int level = 1;
+
+ for(int i=0;i<taxonomy.length();i++){ // step through taxonomy string...
+
+ if (m->control_pressed) { break; }
+ if(taxonomy[i] == ';'){ // looking for semicolons...
+
+ if (taxonName == "") { m->mothurOut(seqName + " has an error in the taxonomy. This may be due to a ;;"); m->mothurOutEndLine(); m->control_pressed = true; }
+
+ int newIndex = tree[treePosition]->getChildIndex(taxonName);// look to see if your current node already
+ // has a child with the new taxonName
+ if(newIndex != -1) { treePosition = newIndex; } // if you've seen it before, jump to that
+ else { // position in the tree
+ int newChildIndex = (int)tree.size(); // otherwise, we'll have to create one...
+ tree[treePosition]->makeChild(taxonName, newChildIndex);
+
+ newNode = new KmerNode(taxonName, level, kmerSize);
+
+ newNode->setParent(treePosition);
+
+ tree.push_back(newNode);
+ treePosition = newChildIndex;
+ }
+
+ // sequence data to that node to update that node's theta - seems slow...
+ taxonName = ""; // clear out the taxon name that we will build as we look
+ level++;
+
+ } // for a semicolon
+ else{
+ taxonName += taxonomy[i]; // keep adding letters until we reach a semicolon
+ }
+ }
+
+ tree[treePosition]->loadSequence(sequence); // now that we've gotten to the correct node, add the
+
+ return 0;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "KmerTree", "addTaxonomyToTree");
+ exit(1);
+ }
+
+}
+
+/**************************************************************************************************/
+
+int KmerTree::aggregateThetas(){
+ try {
+ vector<vector<int> > levelMatrix(numLevels+1);
+
+ for(int i=0;i<tree.size();i++){
+ if (m->control_pressed) { return 0; }
+ levelMatrix[tree[i]->getLevel()].push_back(i);
+ }
+
+ for(int i=numLevels-1;i>0;i--) {
+ if (m->control_pressed) { return 0; }
+
+ for(int j=0;j<levelMatrix[i].size();j++){
+
+ KmerNode* holder = tree[levelMatrix[i][j]];
+
+ tree[holder->getParent()]->addThetas(holder->getTheta(), holder->getNumSeqs());
+ }
+ }
+
+ return 0;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "KmerTree", "aggregateThetas");
+ exit(1);
+ }
+}
+
+/**************************************************************************************************/
+
+int KmerTree::getMinRiskIndexKmer(vector<int>& sequence, vector<int>& taxaIndices, vector<double>& probabilities){
+ try {
+ int numProbs = (int)probabilities.size();
+
+ vector<double> G(numProbs, 0.2); //a random sequence will, on average, be 20% similar to any other sequence; not sure that this holds up for kmers; whatever.
+ vector<double> risk(numProbs, 0);
+
+ for(int i=1;i<numProbs;i++){ //use if you want the outlier group
+ if (m->control_pressed) { return 0; }
+ G[i] = tree[taxaIndices[i]]->getSimToConsensus(sequence);
+ }
+
+ double minRisk = 1e6;
+ int minRiskIndex = 0;
+
+ for(int i=0;i<numProbs;i++){
+ if (m->control_pressed) { return 0; }
+ for(int j=0;j<numProbs;j++){
+ if(i != j){
+ risk[i] += probabilities[j] * G[j];
+ }
+ }
+
+ if(risk[i] < minRisk){
+ minRisk = risk[i];
+ minRiskIndex = i;
+ }
+ }
+
+ return minRiskIndex;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "KmerTree", "getMinRiskIndexKmer");
+ exit(1);
+ }
+}
+
+/**************************************************************************************************/
+
+int KmerTree::sanityCheck(vector<vector<int> >& indices, vector<int>& maxIndices){
+ try {
+ int finalLevel = (int)indices.size()-1;
+
+ for(int position=1;position<indices.size();position++){
+ if (m->control_pressed) { return 0; }
+ int predictedParent = tree[indices[position][maxIndices[position]]]->getParent();
+ int actualParent = indices[position-1][maxIndices[position-1]];
+
+ if(predictedParent != actualParent){
+ finalLevel = position - 1;
+ return finalLevel;
+ }
+ }
+ return finalLevel;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "KmerTree", "sanityCheck");
+ exit(1);
+ }
+}
+
+/**************************************************************************************************/
+string KmerTree::getTaxonomy(Sequence* thisSeq){
+ try {
+ string seqName = thisSeq->getName(); string querySequence = thisSeq->getAligned(); string taxonProbabilityString = "";
+ string unalignedSeq = thisSeq->getUnaligned();
+
+ double logPOutlier = (querySequence.length() - kmerSize + 1) * log(1.0/(double)tree[0]->getNumUniqueKmers());
+
+ vector<int> queryProfile = ripKmerProfile(unalignedSeq); //convert to kmer vector
+
+ vector<vector<double> > pXgivenKj_D_j(numLevels);
+ vector<vector<int> > indices(numLevels);
+ for(int i=0;i<numLevels;i++){
+ if (m->control_pressed) { return taxonProbabilityString; }
+ pXgivenKj_D_j[i].push_back(logPOutlier);
+ indices[i].push_back(-1);
+ }
+
+ for(int i=0;i<numTaxa;i++){
+ if (m->control_pressed) { return taxonProbabilityString; }
+ pXgivenKj_D_j[tree[i]->getLevel()].push_back(tree[i]->getPxGivenkj_D_j(queryProfile));
+ indices[tree[i]->getLevel()].push_back(i);
+ }
+
+ vector<double> sumLikelihood(numLevels, 0);
+ vector<double> bestPosterior(numLevels, 0);
+ vector<int> maxIndex(numLevels, 0);
+ int maxPosteriorIndex;
+
+ //let's find the best level and taxa within that level
+ for(int i=0;i<numLevels;i++){ //go across all j's - from the root to genus
+ if (m->control_pressed) { return taxonProbabilityString; }
+
+ int numTaxaInLevel = (int)indices[i].size();
+
+ vector<double> posteriors(numTaxaInLevel, 0);
+ sumLikelihood[i] = getLogExpSum(pXgivenKj_D_j[i], maxPosteriorIndex);
+
+ maxPosteriorIndex = 0;
+ for(int j=0;j<numTaxaInLevel;j++){
+ posteriors[j] = exp(pXgivenKj_D_j[i][j] - sumLikelihood[i]);
+ if(posteriors[j] > posteriors[maxPosteriorIndex]){
+ maxPosteriorIndex = j;
+ }
+
+ }
+
+ maxIndex[i] = getMinRiskIndexKmer(queryProfile, indices[i], posteriors);
+
+ maxIndex[i] = maxPosteriorIndex;
+ bestPosterior[i] = posteriors[maxIndex[i]];
+ }
+
+ // vector<double> pX_level(numLevels, 0);
+ //
+ // for(int i=0;i<numLevels;i++){
+ // pX_level[i] = pXgivenKj_D_j[i][maxIndex[i]] - tree[indices[i][maxIndex[i]]]->getNumSeqs();
+ // }
+ //
+ // int max_pLevel_X_index = -1;
+ // double pX_level_sum = getLogExpSum(pX_level, max_pLevel_X_index);
+ // double max_pLevel_X = exp(pX_level[max_pLevel_X_index] - pX_level_sum);
+ //
+ // vector<double> pLevel_X(numLevels, 0);
+ // for(int i=0;i<numLevels;i++){
+ // pLevel_X[i] = exp(pX_level[i] - pX_level_sum);
+ // }
+
+ int saneDepth = sanityCheck(indices, maxIndex);
+
+
+ // stringstream levelProbabilityOutput;
+ // levelProbabilityOutput.setf(ios::fixed, ios::floatfield);
+ // levelProbabilityOutput.setf(ios::showpoint);
+
+
+ //taxonProbabilityOutput << seqName << '\t';
+ // taxonProbabilityOutput << seqName << '(' << max_pLevel_X_index << ';' << max_pLevel_X << ')' << '\t';
+ // levelProbabilityOutput << seqName << '(' << max_pLevel_X_index << ';' << max_pLevel_X << ')' << '\t';
+ simpleTax = "";
+ int savedspot = 1;
+ taxonProbabilityString = "";
+ for(int i=1;i<=saneDepth;i++){
+ if (m->control_pressed) { return taxonProbabilityString; }
+ int confidenceScore = (int) (bestPosterior[i] * 100);
+ if (confidenceScore >= confidenceThreshold) {
+ if(indices[i][maxIndex[i]] != -1){
+ taxonProbabilityString += tree[indices[i][maxIndex[i]]]->getName() + "(" + toString(confidenceScore) + ");";
+ simpleTax += tree[indices[i][maxIndex[i]]]->getName() + ";";
+
+ // levelProbabilityOutput << tree[indices[i][maxIndex[i]]]->getName() << '(' << setprecision(6) << pLevel_X[i] << ");";
+ }
+ else{
+ taxonProbabilityString += "unclassified" + '(' + toString(confidenceScore) + ");";
+ // levelProbabilityOutput << "unclassified" << '(' << setprecision(6) << pLevel_X[i] << ");";
+ simpleTax += "unclassified;";
+ }
+ }else { break; }
+ savedspot = i;
+ }
+
+
+
+ for(int i=savedspot+1;i<numLevels;i++){
+ if (m->control_pressed) { return taxonProbabilityString; }
+ taxonProbabilityString += "unclassified(0);";
+ simpleTax += "unclassified;";
+ }
+
+ return taxonProbabilityString;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "KmerTree", "getTaxonomy");
+ exit(1);
+ }
+}
+
+
+/**************************************************************************************************/
+
--- /dev/null
+//
+// kmerTree.h
+// pdsBayesian
+//
+// Created by Patrick Schloss on 4/3/12.
+// Copyright (c) 2012 University of Michigan. All rights reserved.
+//
+
+#ifndef pdsBayesian_kmerTree_h
+#define pdsBayesian_kmerTree_h
+
+#include "classify.h"
+
+class KmerNode;
+
+class KmerTree : public Classify {
+
+public:
+ KmerTree(string, string, int, int);
+ ~KmerTree();
+
+ string getTaxonomy(Sequence*);
+
+private:
+ int addTaxonomyToTree(string, string, vector<int>&);
+ vector<int> ripKmerProfile(string);
+ int getMinRiskIndexKmer(vector<int>&, vector<int>&, vector<double>&);
+ int aggregateThetas();
+ int sanityCheck(vector<vector<int> >&, vector<int>&);
+
+ int kmerSize;
+ int numPossibleKmers, confidenceThreshold;
+ vector<KmerNode*> tree;
+
+};
+
+#endif
--- /dev/null
+/*
+ * taxonomynode.cpp
+ *
+ *
+ * Created by Pat Schloss on 7/8/11.
+ * Copyright 2011 Patrick D. Schloss. All rights reserved.
+ *
+ */
+
+/**************************************************************************************************/
+
+#include "taxonomynode.h"
+
+/**************************************************************************************************/
+
+TaxonomyNode::TaxonomyNode(string n, int l): name(n), level(l){
+ m = MothurOut::getInstance();
+ parent = -1;
+ numChildren = 0;
+ numSeqs = 0;
+}
+
+/**************************************************************************************************/
+
+void TaxonomyNode::setName(string n) { name = n; }
+
+/**************************************************************************************************/
+
+string TaxonomyNode::getName() { return name; }
+
+/**************************************************************************************************/
+
+void TaxonomyNode::setParent(int p) { parent = p; }
+
+/**************************************************************************************************/
+
+int TaxonomyNode::getParent() { return parent; }
+
+/**************************************************************************************************/
+
+void TaxonomyNode::makeChild(string c, int i) { children[c] = i; }
+
+
+/**************************************************************************************************/
+
+map<string, int> TaxonomyNode::getChildren() { return children; }
+
+/**************************************************************************************************/
+
+int TaxonomyNode::getChildIndex(string c){
+ map<string, int>::iterator it = children.find(c);
+ if(it != children.end()) { return it->second; }
+ else { return -1; }
+}
+
+/**************************************************************************************************/
+
+int TaxonomyNode::getNumKids() { return (int)children.size(); }
+
+/**************************************************************************************************/
+
+int TaxonomyNode::getNumSeqs() { return numSeqs; }
+
+/**************************************************************************************************/
+
+void TaxonomyNode::setTotalSeqs(int n) { totalSeqs = n; }
+
+/**************************************************************************************************/
+
+int TaxonomyNode::getLevel() { return level; }
+
+/**************************************************************************************************/
--- /dev/null
+#ifndef TAXONOMYNODE
+#define TAXONOMYNODE
+
+/*
+ * taxonomynode.h
+ *
+ *
+ * Created by Pat Schloss on 7/8/11.
+ * Copyright 2011 Patrick D. Schloss. All rights reserved.
+ *
+ */
+
+/**************************************************************************************************/
+
+#include "mothurout.h"
+/**************************************************************************************************/
+
+class TaxonomyNode {
+
+public:
+ TaxonomyNode();
+ TaxonomyNode(string, int);
+ void setName(string);
+ string getName();
+
+
+ void setParent(int);
+ int getParent();
+
+ void makeChild(string, int);
+ map<string, int> getChildren();
+ int getChildIndex(string);
+ int getNumKids();
+ int getNumSeqs();
+ void setTotalSeqs(int);
+ int getLevel();
+
+private:
+ int parent;
+ map<string, int> children;
+ int numChildren;
+ int level;
+
+protected:
+ MothurOut* m;
+ int numSeqs;
+ int totalSeqs;
+ string name;
+};
+
+/**************************************************************************************************/
+
+#endif