]> git.donarmstrong.com Git - mothur.git/commitdiff
added zap method to classify.seqs and changed bayesian method name to wang.
authorSarah Westcott <mothur.westcott@gmail.com>
Wed, 3 Oct 2012 18:51:10 +0000 (14:51 -0400)
committerSarah Westcott <mothur.westcott@gmail.com>
Wed, 3 Oct 2012 18:51:10 +0000 (14:51 -0400)
15 files changed:
Mothur.xcodeproj/project.pbxproj
alignnode.cpp [new file with mode: 0755]
alignnode.h [new file with mode: 0755]
aligntree.cpp [new file with mode: 0755]
aligntree.h [new file with mode: 0755]
classify.cpp
classify.h
classifyseqscommand.cpp
classifyseqscommand.h
kmernode.cpp [new file with mode: 0755]
kmernode.h [new file with mode: 0755]
kmertree.cpp [new file with mode: 0755]
kmertree.h [new file with mode: 0755]
taxonomynode.cpp [new file with mode: 0755]
taxonomynode.h [new file with mode: 0755]

index 9d6261b1f3c6a6abab8dc675a45b648cc4b73093..b2895e8b73839ec48b795ab0877b0e9aad0eb330 100644 (file)
                A71CB160130B04A2001E7287 /* anosimcommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A71CB15E130B04A2001E7287 /* anosimcommand.cpp */; };
                A71FE12C12EDF72400963CA7 /* mergegroupscommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A71FE12B12EDF72400963CA7 /* mergegroupscommand.cpp */; };
                A721765713BB9F7D0014DAAE /* referencedb.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A721765613BB9F7D0014DAAE /* referencedb.cpp */; };
+               A721AB6A161C570F009860A1 /* alignnode.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A721AB66161C570F009860A1 /* alignnode.cpp */; };
+               A721AB6B161C570F009860A1 /* aligntree.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A721AB68161C570F009860A1 /* aligntree.cpp */; };
+               A721AB71161C572A009860A1 /* kmernode.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A721AB6D161C572A009860A1 /* kmernode.cpp */; };
+               A721AB72161C572A009860A1 /* kmertree.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A721AB6F161C572A009860A1 /* kmertree.cpp */; };
+               A721AB77161C573B009860A1 /* taxonomynode.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A721AB73161C573B009860A1 /* taxonomynode.cpp */; };
                A724D2B7153C8628000A826F /* makebiomcommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A724D2B6153C8628000A826F /* makebiomcommand.cpp */; };
                A727864412E9E28C00F86ABA /* removerarecommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A727864312E9E28C00F86ABA /* removerarecommand.cpp */; };
                A7386C231619CCE600651424 /* classifysharedcommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7386C211619CCE600651424 /* classifysharedcommand.cpp */; };
                A71FE12B12EDF72400963CA7 /* mergegroupscommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = mergegroupscommand.cpp; sourceTree = "<group>"; };
                A721765513BB9F7D0014DAAE /* referencedb.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = referencedb.h; sourceTree = "<group>"; };
                A721765613BB9F7D0014DAAE /* referencedb.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = referencedb.cpp; sourceTree = "<group>"; };
+               A721AB66161C570F009860A1 /* alignnode.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = alignnode.cpp; sourceTree = "<group>"; };
+               A721AB67161C570F009860A1 /* alignnode.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = alignnode.h; sourceTree = "<group>"; };
+               A721AB68161C570F009860A1 /* aligntree.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = aligntree.cpp; sourceTree = "<group>"; };
+               A721AB69161C570F009860A1 /* aligntree.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = aligntree.h; sourceTree = "<group>"; };
+               A721AB6D161C572A009860A1 /* kmernode.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = kmernode.cpp; sourceTree = "<group>"; };
+               A721AB6E161C572A009860A1 /* kmernode.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = kmernode.h; sourceTree = "<group>"; };
+               A721AB6F161C572A009860A1 /* kmertree.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = kmertree.cpp; sourceTree = "<group>"; };
+               A721AB70161C572A009860A1 /* kmertree.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = kmertree.h; sourceTree = "<group>"; };
+               A721AB73161C573B009860A1 /* taxonomynode.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = taxonomynode.cpp; sourceTree = "<group>"; };
+               A721AB74161C573B009860A1 /* taxonomynode.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = taxonomynode.h; sourceTree = "<group>"; };
                A724D2B4153C8600000A826F /* makebiomcommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = makebiomcommand.h; sourceTree = "<group>"; };
                A724D2B6153C8628000A826F /* makebiomcommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = makebiomcommand.cpp; sourceTree = "<group>"; };
                A727864212E9E28C00F86ABA /* removerarecommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = removerarecommand.h; sourceTree = "<group>"; };
                A7E9BA4B12D3966900DA6239 /* classifier */ = {
                        isa = PBXGroup;
                        children = (
-                               A7E9B65A12D37EC300DA6239 /* bayesian.cpp */,
+                               A721AB67161C570F009860A1 /* alignnode.h */,
+                               A721AB66161C570F009860A1 /* alignnode.cpp */,
+                               A721AB69161C570F009860A1 /* aligntree.h */,
+                               A721AB68161C570F009860A1 /* aligntree.cpp */,
                                A7E9B65B12D37EC300DA6239 /* bayesian.h */,
+                               A7E9B65A12D37EC300DA6239 /* bayesian.cpp */,
                                A7E9B68E12D37EC400DA6239 /* classify.cpp */,
                                A7E9B68F12D37EC400DA6239 /* classify.h */,
+                               A721AB6E161C572A009860A1 /* kmernode.h */,
+                               A721AB6D161C572A009860A1 /* kmernode.cpp */,
+                               A721AB70161C572A009860A1 /* kmertree.h */,
+                               A721AB6F161C572A009860A1 /* kmertree.cpp */,
                                A7E9B73812D37EC400DA6239 /* knn.h */,
                                A7E9B73712D37EC400DA6239 /* knn.cpp */,
                                A7E9B78D12D37EC400DA6239 /* phylosummary.cpp */,
                                A7E9B79012D37EC400DA6239 /* phylotree.h */,
                                A7E9B85D12D37EC400DA6239 /* taxonomyequalizer.cpp */,
                                A7E9B85E12D37EC400DA6239 /* taxonomyequalizer.h */,
+                               A721AB74161C573B009860A1 /* taxonomynode.h */,
+                               A721AB73161C573B009860A1 /* taxonomynode.cpp */,
                        );
                        name = classifier;
                        sourceTree = "<group>";
                                A7386C29161A110800651424 /* decisiontree.cpp in Sources */,
                                A77E1938161B201E00DB1A2A /* randomforest.cpp in Sources */,
                                A77E193B161B289600DB1A2A /* rftreenode.cpp in Sources */,
+                               A721AB6A161C570F009860A1 /* alignnode.cpp in Sources */,
+                               A721AB6B161C570F009860A1 /* aligntree.cpp in Sources */,
+                               A721AB71161C572A009860A1 /* kmernode.cpp in Sources */,
+                               A721AB72161C572A009860A1 /* kmertree.cpp in Sources */,
+                               A721AB77161C573B009860A1 /* taxonomynode.cpp in Sources */,
                        );
                        runOnlyForDeploymentPostprocessing = 0;
                };
diff --git a/alignnode.cpp b/alignnode.cpp
new file mode 100755 (executable)
index 0000000..ccd8fb0
--- /dev/null
@@ -0,0 +1,257 @@
+/*
+ *  alignNode.cpp
+ *  bayesian
+ *
+ *  Created by Pat Schloss on 10/11/11.
+ *  Copyright 2011 Patrick D. Schloss. All rights reserved.
+ *
+ */
+
+#include "alignNode.h"
+#include "taxonomynode.h"
+
+#include "bayesian.h"
+
+/**************************************************************************************************/
+
+AlignNode::AlignNode(string n, int l): TaxonomyNode(n, l){
+
+       alignLength = 0;
+}
+
+/**************************************************************************************************/
+
+void AlignNode::printTheta(){
+    try {
+        m->mothurOut("A:\t"); for(int i=0;i<alignLength;i++){  m->mothurOut(toString(theta[i].A)+ '\t');               }       m->mothurOutEndLine();
+        m->mothurOut("T:\t"); for(int i=0;i<alignLength;i++){  m->mothurOut(toString(theta[i].T)+ '\t');               }       m->mothurOutEndLine();
+        m->mothurOut("G:\t"); for(int i=0;i<alignLength;i++){  m->mothurOut(toString(theta[i].G)+ '\t');               }       m->mothurOutEndLine();
+        m->mothurOut("C:\t"); for(int i=0;i<alignLength;i++){  m->mothurOut(toString(theta[i].C)+ '\t');               }       m->mothurOutEndLine();
+        m->mothurOut("I:\t"); for(int i=0;i<alignLength;i++){  m->mothurOut(toString(theta[i].gap)+ '\t');             }       m->mothurOutEndLine();
+    }
+       catch(exception& e) {
+               m->errorOut(e, "AlignNode", "printTheta");
+               exit(1);
+       }
+}
+
+/**************************************************************************************************/
+
+int AlignNode::loadSequence(string& sequence){
+       try {
+        alignLength = (int)sequence.length();  //      this function runs through the alignment and increments the frequency
+        //     of each base for a particular taxon.  we are building the thetas
+        
+        if(theta.size() == 0){
+            theta.resize(alignLength);
+            columnCounts.resize(alignLength, 0);
+        }
+        
+        for(int i=0;i<alignLength;i++){
+            
+            if (m->control_pressed) { return 0; }
+            
+            char base = sequence[i];
+            
+            if(base == 'A')            {       theta[i].A++;   columnCounts[i]++;              }       //      our thetas will be alignLength x 5  
+            else if(base == 'T'){      theta[i].T++;   columnCounts[i]++;              }       //      and we ignore any position that has  
+            else if(base == 'G'){      theta[i].G++;   columnCounts[i]++;              }       //      an ambiguous base call
+            else if(base == 'C'){      theta[i].C++;   columnCounts[i]++;              }
+            else if(base == '-'){      theta[i].gap++; columnCounts[i]++;              }
+            else if(base == 'U'){      theta[i].T++;   columnCounts[i]++;              }
+        }
+        
+        numSeqs++;
+        
+        return 0;
+    }
+       catch(exception& e) {
+               m->errorOut(e, "AlignNode", "loadSequence");
+               exit(1);
+       }
+}      
+
+/**************************************************************************************************/
+
+int AlignNode::checkTheta(){
+    try {
+        for(int i=0;i<alignLength;i++){
+            
+            if (m->control_pressed) { return 0; }
+            
+            if(theta[i].gap == columnCounts[i]){
+                columnCounts[i] = 0;
+            }
+            //        else{
+            //            int maxCount = theta[i].A;
+            //            
+            //            if(theta[i].T > maxCount)   {    maxCount = theta[i].T;  }
+            //            if(theta[i].G > maxCount)   {    maxCount = theta[i].T;  }
+            //            if(theta[i].C > maxCount)   {    maxCount = theta[i].T;  }
+            //            if(theta[i].gap > maxCount) {    maxCount = theta[i].T;  }
+            //        
+            //            if(maxCount < columnCounts[i] * 0.25){// || maxCount == columnCounts[i]){   //remove any column where the maximum frequency is <50%
+            //                columnCounts[i] = 0;
+            //            }
+            //        }
+            
+        }
+        
+        return 0;
+    }
+       catch(exception& e) {
+               m->errorOut(e, "AlignNode", "checkTheta");
+               exit(1);
+       }
+}
+
+/**************************************************************************************************/
+
+int AlignNode::addThetas(vector<thetaAlign> newTheta, int newNumSeqs){
+       try {
+        if(alignLength == 0){
+            alignLength = (int)newTheta.size();
+            theta.resize(alignLength);
+            columnCounts.resize(alignLength);
+        }
+        
+        for(int i=0;i<alignLength;i++){        
+            
+            if (m->control_pressed) { return 0; }
+            
+            theta[i].A += newTheta[i].A;               columnCounts[i] += newTheta[i].A;
+            theta[i].T += newTheta[i].T;               columnCounts[i] += newTheta[i].T;
+            theta[i].G += newTheta[i].G;               columnCounts[i] += newTheta[i].G;
+            theta[i].C += newTheta[i].C;               columnCounts[i] += newTheta[i].C;
+            theta[i].gap += newTheta[i].gap;   columnCounts[i] += newTheta[i].gap;
+        }
+        
+        numSeqs += newNumSeqs;
+        
+        return 0;
+    }
+       catch(exception& e) {
+               m->errorOut(e, "AlignNode", "addThetas");
+               exit(1);
+       }
+}
+
+/**************************************************************************************************/
+
+double AlignNode::getSimToConsensus(string& query){
+       try {
+        double similarity = 0;
+        
+        int length = 0;
+        
+        for(int i=0;i<alignLength;i++){
+            
+            if (m->control_pressed) { return similarity; }
+            
+            char base = query[i];
+            
+            if(base != '.' && base != 'N' && columnCounts[i] != 0){
+                
+                double fraction = 0;
+                
+                if(base == 'A'){
+                    fraction = (int) theta[i].A / (double) columnCounts[i];
+                    similarity += fraction;
+                    length++;
+                }
+                else if(base == 'T'){
+                    fraction = (int) theta[i].T / (double) columnCounts[i];
+                    similarity += fraction;
+                    length++;
+                }
+                else if(base == 'G'){
+                    fraction = (int) theta[i].G / (double) columnCounts[i];
+                    similarity += fraction;
+                    length++;
+                }
+                else if(base == 'C'){
+                    fraction = (int) theta[i].C / (double) columnCounts[i];
+                    similarity += fraction;
+                    length++;
+                }
+                else if(base == '-'){
+                    fraction = (int) theta[i].gap / (double) columnCounts[i];
+                    similarity += fraction;
+                    length++;
+                }
+            }
+        }
+        
+        if(length != 0){
+            similarity /= double(length);
+        }
+        else {
+            similarity = 0;
+        }
+        
+        return similarity;     
+        
+    }
+    catch(exception& e) {
+        m->errorOut(e, "AlignNode", "getSimToConsensus");
+        exit(1);
+    }
+}
+
+/**************************************************************************************************/
+
+double AlignNode::getPxGivenkj_D_j(string& query){     //P(x | k_j, D, j)
+       try {
+        double PxGivenkj_D_j = 0;
+        
+        int count = 0;
+        double alpha = 1 / (double)totalSeqs;  //flat prior
+        
+        
+        for(int s=0;s<alignLength;s++){
+            
+            if (m->control_pressed) { return PxGivenkj_D_j; }
+            
+            char base = query[s];
+            thetaAlign thetaS = theta[s];
+            
+            if(base != '.' && base != 'N' && columnCounts[s] != 0){
+                double Nkj_s = (double)columnCounts[s];        
+                double nkj_si = 0;
+                
+                
+                if(base == 'A')                {       nkj_si = (double)thetaS.A;              }
+                else if(base == 'T'){  nkj_si = (double)thetaS.T;              }       
+                else if(base == 'G'){  nkj_si = (double)thetaS.G;              }       
+                else if(base == 'C'){  nkj_si = (double)thetaS.C;              }       
+                else if(base == '-'){  nkj_si = (double)thetaS.gap;    }
+                else if(base == 'U'){  nkj_si = (double)thetaS.T;              }       
+                
+                //                     double alpha = pow(0.2, double(Nkj_s)) + 0.0001;        //need to make 1e-4 a variable in future; this is the non-flat prior
+                
+                //                     if(columnCounts[s] != nkj_si){                                          //deal only with segregating sites...
+                               double numerator = nkj_si + alpha;
+                               double denomenator = Nkj_s + 5.0 * alpha;
+                               
+                               PxGivenkj_D_j += log(numerator) - log(denomenator);             
+                               count++;
+                //                     }
+            }
+            if(base != '.' && columnCounts[s] == 0 && thetaS.gap == 0){
+                count = 0;
+                break;
+            }
+            
+        }
+        
+        if(count == 0){        PxGivenkj_D_j = -1e10;  }       
+        
+        return PxGivenkj_D_j;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "AlignNode", "getPxGivenkj_D_j");
+        exit(1);
+    }
+}
+
+/**************************************************************************************************/
diff --git a/alignnode.h b/alignnode.h
new file mode 100755 (executable)
index 0000000..4aecca7
--- /dev/null
@@ -0,0 +1,49 @@
+#ifndef ALIGNNODE
+#define ALIGNNODE
+
+/*
+ *  alignNode.h
+ *  bayesian
+ *
+ *  Created by Pat Schloss on 10/11/11.
+ *  Copyright 2011 Patrick D. Schloss. All rights reserved.
+ *
+ */
+
+#include "taxonomynode.h"
+
+/**************************************************************************************************/
+
+struct thetaAlign {
+       thetaAlign() : A(0), T(0), G(0), C(0), gap(0){}
+       unsigned int A;
+       unsigned int T;
+       unsigned int G;
+       unsigned int C;
+       unsigned int gap;
+};
+
+/**************************************************************************************************/
+
+class AlignNode : public TaxonomyNode {
+       
+public:
+       AlignNode(string, int);
+       int loadSequence(string&);
+       int checkTheta();
+    void printTheta();
+       double getPxGivenkj_D_j(string& query); //P(x | k_j, D, j)
+       double getSimToConsensus(string& query);
+       vector<thetaAlign> getTheta()   {       return theta;   }
+       int addThetas(vector<thetaAlign>, int);
+       
+private:
+       vector<thetaAlign> theta;
+       vector<unsigned int> columnCounts;
+       int alignLength;
+};
+
+/**************************************************************************************************/
+
+#endif
+
diff --git a/aligntree.cpp b/aligntree.cpp
new file mode 100755 (executable)
index 0000000..41667ca
--- /dev/null
@@ -0,0 +1,371 @@
+//
+//  alignTree.cpp
+//  pdsBayesian
+//
+//  Created by Patrick Schloss on 4/3/12.
+//  Copyright (c) 2012 University of Michigan. All rights reserved.
+//
+
+#include "alignnode.h"
+#include "aligntree.h"
+
+/**************************************************************************************************/
+
+AlignTree::AlignTree(string referenceFileName, string taxonomyFileName, int cutoff) : Classify(), confidenceThreshold(cutoff){
+       try {
+        AlignNode* newNode = new AlignNode("Root", 0);
+        tree.push_back(newNode);                       //      the tree is stored as a vector of elements of type TaxonomyNode
+        
+        string refTaxonomy;
+        
+        readTaxonomy(taxonomyFileName);
+     
+        ifstream referenceFile;
+        m->openInputFile(referenceFileName, referenceFile);
+        bool error = false;
+        map<int, int> lengths;
+        while(!referenceFile.eof()){
+            
+            if (m->control_pressed) { break; }
+            
+            Sequence seq(referenceFile);  m->gobble(referenceFile);
+            
+            if (seq.getName() != "") {
+                map<string, string>::iterator it = taxonomy.find(seq.getName());
+                
+                if (it != taxonomy.end()) {
+                    refTaxonomy = it->second;          //      lookup the taxonomy string for the current reference sequence
+                    string aligned = seq.getAligned();
+                    lengths[aligned.length()] = 1;
+                    if (lengths.size() > 1) { error = true; m->mothurOut("[ERROR]: reference sequences must be aligned to use the align method, quitting.\n"); break; }
+                    addTaxonomyToTree(seq.getName(), refTaxonomy, aligned);
+                }else {
+                    m->mothurOut(seq.getName() + " is in your reference file, but not in your taxonomy file, please correct.\n"); error = true;
+                }
+            }
+        }
+        referenceFile.close();
+        
+        length = (lengths.begin())->first;  
+           
+        if (error) { m->control_pressed = true; }
+        
+        numTaxa = (int)tree.size();
+        
+        numLevels = 0;
+        for(int i=0;i<numTaxa;i++){
+            int level = tree[i]->getLevel();
+            if(level > numLevels){     numLevels = level;      }
+        }
+        numLevels++;
+        
+        aggregateThetas();
+        
+        int dbSize = tree[0]->getNumSeqs();
+        
+        for(int i=0;i<numTaxa;i++){
+            tree[i]->checkTheta();
+            tree[i]->setTotalSeqs(dbSize);
+        }
+        
+    }
+    catch(exception& e) {
+        m->errorOut(e, "AlignTree", "AlignTree");
+        exit(1);
+    }
+}
+
+/**************************************************************************************************/
+
+AlignTree::~AlignTree(){
+       try {
+        for(int i=0;i<tree.size();i++){
+            delete tree[i];
+        }
+       }
+    catch(exception& e) {
+        m->errorOut(e, "AlignTree", "~AlignTree");
+        exit(1);
+    }
+}      
+
+/**************************************************************************************************/
+
+int AlignTree::addTaxonomyToTree(string seqName, string& taxonomy, string& sequence){
+       try {
+        AlignNode* newNode;
+        string taxonName = "";
+        int treePosition = 0;                                                  //      the root is element 0
+        
+        int level = 1;
+        
+        for(int i=0;i<taxonomy.length();i++){                  //      step through taxonomy string...
+            
+            if (m->control_pressed) { break; }
+            
+            if(taxonomy[i] == ';'){                                            //      looking for semicolons...
+                
+                if (taxonName == "") {  m->mothurOut(seqName + " has an error in the taxonomy.  This may be due to a ;;"); m->mothurOutEndLine(); m->control_pressed = true; }
+                
+                int newIndex = tree[treePosition]->getChildIndex(taxonName);   //      look to see if your current node already
+                //     has a child with the new taxonName
+                if(newIndex != -1)     {       treePosition = newIndex;        }               //      if you've seen it before, jump to that
+                else {                                                                                                         //       position in the tree
+                    int newChildIndex = (int)tree.size();                                              //      otherwise, we'll have to create one...
+                    tree[treePosition]->makeChild(taxonName, newChildIndex);
+                    
+                    newNode = new AlignNode(taxonName, level);
+                    
+                    newNode->setParent(treePosition);
+                    
+                    tree.push_back(newNode);
+                    treePosition = newChildIndex;
+                }
+                
+                //     sequence data to that node to update that node's theta - seems slow...                          
+                taxonName = "";                                                                //      clear out the taxon name that we will build as we look 
+                level++;
+            }                                                                                          //      for a semicolon
+            else{
+                taxonName += taxonomy[i];                                      //      keep adding letters until we reach a semicolon
+            }
+        }
+        tree[treePosition]->loadSequence(sequence);    //      now that we've gotten to the correct node, add the
+        
+        return 0;
+       }
+    catch(exception& e) {
+        m->errorOut(e, "AlignTree", "addTaxonomyToTree");
+        exit(1);
+    }
+}
+
+/**************************************************************************************************/
+
+int AlignTree::aggregateThetas(){
+       try {
+        vector<vector<int> > levelMatrix(numLevels+1);
+        
+        for(int i=0;i<tree.size();i++){
+            if (m->control_pressed) { return 0; }
+            levelMatrix[tree[i]->getLevel()].push_back(i);
+        }
+               
+        for(int i=numLevels-1;i>0;i--){
+            if (m->control_pressed) { return 0; }
+            for(int j=0;j<levelMatrix[i].size();j++){
+                
+                AlignNode* holder = tree[levelMatrix[i][j]];
+                
+                tree[holder->getParent()]->addThetas(holder->getTheta(), holder->getNumSeqs());                                
+            }
+        }
+           return 0;
+       }
+    catch(exception& e) {
+        m->errorOut(e, "AlignTree", "aggregateThetas");
+        exit(1);
+    }
+}
+
+/**************************************************************************************************/
+
+double AlignTree::getOutlierLogProbability(string& sequence){
+       try {
+        double count = 0;
+        
+        for(int i=0;i<sequence.length();i++){
+            
+            if(sequence[i] != '.'){    count++;        }
+            
+        }
+        
+        return count * log(0.2);
+    }
+    catch(exception& e) {
+        m->errorOut(e, "AlignTree", "getOutlierLogProbability");
+        exit(1);
+    }
+}
+
+/**************************************************************************************************/
+
+int AlignTree::getMinRiskIndexAlign(string& sequence, vector<int>& taxaIndices, vector<double>& probabilities){
+       try {
+        int numProbs = (int)probabilities.size();
+        
+        vector<double> G(numProbs, 0.2);       //a random sequence will, on average, be 20% similar to any other sequence
+        vector<double> risk(numProbs, 0);
+        
+        for(int i=1;i<numProbs;i++){ //use if you want the outlier group
+            if (m->control_pressed) { return 0; }
+            G[i] = tree[taxaIndices[i]]->getSimToConsensus(sequence);
+        }
+        
+        double minRisk = 1e6;
+        int minRiskIndex = 0;
+        
+        for(int i=0;i<numProbs;i++){
+            if (m->control_pressed) { return 0; }
+            for(int j=0;j<numProbs;j++){
+                if(i != j){
+                    risk[i] += probabilities[j] * G[j];
+                }                      
+            }
+            
+            if(risk[i] < minRisk){
+                minRisk = risk[i];
+                minRiskIndex = i;
+            }
+        }
+        
+        return minRiskIndex;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "AlignTree", "getMinRiskIndexAlign");
+        exit(1);
+    }
+
+}
+
+/**************************************************************************************************/
+
+int AlignTree::sanityCheck(vector<vector<int> >& indices, vector<int>& maxIndices){
+       try {
+        int finalLevel = (int)indices.size()-1;
+        
+        for(int position=1;position<indices.size();position++){
+            if (m->control_pressed) { return 0; }
+            int predictedParent = tree[indices[position][maxIndices[position]]]->getParent();
+            int actualParent = indices[position-1][maxIndices[position-1]];
+            
+            if(predictedParent != actualParent){
+                finalLevel = position - 1;
+                return finalLevel;
+            }
+        }
+        return finalLevel;
+       }
+    catch(exception& e) {
+        m->errorOut(e, "AlignTree", "sanityCheck");
+        exit(1);
+    }
+}
+
+/**************************************************************************************************/
+
+string AlignTree::getTaxonomy(Sequence* seq){
+    try {
+        string seqName = seq->getName(); string querySequence = seq->getAligned(); string taxonProbabilityString = "";
+        if (querySequence.length() != length) {
+            m->mothurOut("[ERROR]: " + seq->getName() + " has length " + toString(querySequence.length()) + ", reference sequences length is " + toString(length) + ". Are your sequences aligned? Sequences must be aligned to use the align search method.\n"); m->control_pressed = true; return "";
+        }
+        double logPOutlier = getOutlierLogProbability(querySequence);
+        
+        vector<vector<double> > pXgivenKj_D_j(numLevels);
+        vector<vector<int> > indices(numLevels);
+        for(int i=0;i<numLevels;i++){
+            if (m->control_pressed) { return taxonProbabilityString; }
+            pXgivenKj_D_j[i].push_back(logPOutlier);
+            indices[i].push_back(-1);
+        }
+        
+        
+        for(int i=0;i<numTaxa;i++){
+            //         cout << i << '\t' << tree[i]->getName() << '\t' << tree[i]->getLevel() << '\t' << tree[i]->getPxGivenkj_D_j(querySequence) << endl;
+            if (m->control_pressed) { return taxonProbabilityString; }
+            pXgivenKj_D_j[tree[i]->getLevel()].push_back(tree[i]->getPxGivenkj_D_j(querySequence));
+            indices[tree[i]->getLevel()].push_back(i);
+        }
+        
+        vector<double> sumLikelihood(numLevels, 0);
+        vector<double> bestPosterior(numLevels, 0);
+        vector<int> maxIndex(numLevels, 0);
+        int maxPosteriorIndex;
+        
+        
+               //cout << "before best level" << endl;
+        
+        //let's find the best level and taxa within that level
+        for(int i=0;i<numLevels;i++){ //go across all j's - from the root to genus
+            if (m->control_pressed) { return taxonProbabilityString; }
+            int numTaxaInLevel = (int)indices[i].size();
+            
+                       //cout << "numTaxaInLevel:\t" << numTaxaInLevel << endl;
+            
+            vector<double> posteriors(numTaxaInLevel, 0);              
+            sumLikelihood[i] = getLogExpSum(pXgivenKj_D_j[i], maxPosteriorIndex);
+            
+            maxPosteriorIndex = 0;
+            for(int j=0;j<numTaxaInLevel;j++){
+                posteriors[j] = exp(pXgivenKj_D_j[i][j] - sumLikelihood[i]);
+                
+                if(posteriors[j] > posteriors[maxPosteriorIndex]){     
+                    maxPosteriorIndex = j;
+                }
+                
+            }
+            
+            maxIndex[i] = getMinRiskIndexAlign(querySequence, indices[i], posteriors);
+            
+            maxIndex[i] = maxPosteriorIndex;
+            bestPosterior[i] = posteriors[maxIndex[i]];        
+        }
+        
+        //     vector<double> pX_level(numLevels, 0);
+        //     
+        //     for(int i=0;i<numLevels;i++){
+        //             pX_level[i] = pXgivenKj_D_j[i][maxIndex[i]] - tree[indices[i][maxIndex[i]]]->getNumSeqs();
+        //     }
+        //     
+        //     int max_pLevel_X_index = -1;
+        //     double pX_level_sum = getLogExpSum(pX_level, max_pLevel_X_index);
+        //     double max_pLevel_X = exp(pX_level[max_pLevel_X_index] - pX_level_sum);
+        //     
+        //     vector<double> pLevel_X(numLevels, 0);
+        //     for(int i=0;i<numLevels;i++){
+        //             pLevel_X[i] = exp(pX_level[i] - pX_level_sum);
+        //     }
+        
+        
+        
+        
+        int saneDepth = sanityCheck(indices, maxIndex);
+        
+        simpleTax = "";
+        int savedspot = 1;
+        taxonProbabilityString = "";
+        for(int i=1;i<=saneDepth;i++){
+            if (m->control_pressed) { return taxonProbabilityString; }
+            int confidenceScore = (int) (bestPosterior[i] * 100);
+            if (confidenceScore >= confidenceThreshold) {
+            if(indices[i][maxIndex[i]] != -1){
+                taxonProbabilityString += tree[indices[i][maxIndex[i]]]->getName() + '(' + toString(confidenceScore) + ");";
+                simpleTax += tree[indices[i][maxIndex[i]]]->getName() + ";";
+                //                     levelProbabilityOutput << tree[indices[i][maxIndex[i]]]->getName() << '(' << setprecision(6) << pLevel_X[i] << ");";
+            }
+            else{
+                taxonProbabilityString + "unclassified" + '(' + toString(confidenceScore) + ");";
+                //                     levelProbabilityOutput << "unclassified" << '(' << setprecision(6) << pLevel_X[i] << ");";
+                simpleTax += "unclassified;";
+            }
+            }else { break; }
+            savedspot = i;
+        }
+        
+        for(int i=savedspot+1;i<numLevels;i++){
+            if (m->control_pressed) { return taxonProbabilityString; }
+            taxonProbabilityString + "unclassified(0);";
+            simpleTax += "unclassified;";
+        }
+        
+        return taxonProbabilityString;
+       }
+    catch(exception& e) {
+        m->errorOut(e, "AlignTree", "getTaxonomy");
+        exit(1);
+    }
+}
+
+
+/**************************************************************************************************/
diff --git a/aligntree.h b/aligntree.h
new file mode 100755 (executable)
index 0000000..51008ff
--- /dev/null
@@ -0,0 +1,34 @@
+//
+//  alignTree.h
+//  pdsBayesian
+//
+//  Created by Patrick Schloss on 4/3/12.
+//  Copyright (c) 2012 University of Michigan. All rights reserved.
+//
+
+#ifndef pdsBayesian_alignTree_h
+#define pdsBayesian_alignTree_h
+
+#include "classify.h"
+
+class AlignNode;
+
+class AlignTree : public Classify {
+
+public:
+       AlignTree(string, string, int);
+       ~AlignTree();
+       string getTaxonomy(Sequence*);
+       
+private:
+    int addTaxonomyToTree(string, string&, string&);
+       double getOutlierLogProbability(string&);
+       int getMinRiskIndexAlign(string&, vector<int>&, vector<double>&);
+       int aggregateThetas();
+       int sanityCheck(vector<vector<int> >&, vector<int>&);
+
+       int numSeqs, confidenceThreshold, length;
+       vector<AlignNode*> tree;
+};
+
+#endif
index 459c90f9ddd769628b0fc9061e7bd504a88b2900..8aa3cdb381ed7a389667ce61962d47cefac15ddd 100644 (file)
@@ -354,3 +354,37 @@ vector<string> Classify::parseTax(string tax) {
 }
 /**************************************************************************************************/
 
+double Classify::getLogExpSum(vector<double> probabilities, int& maxIndex){
+       try {
+        //     http://jblevins.org/notes/log-sum-exp
+        
+        double maxProb = probabilities[0];
+        maxIndex = 0;
+        
+        int numProbs = (int)probabilities.size();
+        
+        for(int i=1;i<numProbs;i++){
+            if(probabilities[i] >= maxProb){
+                maxProb = probabilities[i];
+                maxIndex = i;
+            }
+        }
+        
+        double probSum = 0.0000;
+        
+        for(int i=0;i<numProbs;i++){
+            probSum += exp(probabilities[i] - maxProb);                
+        }
+        
+        probSum = log(probSum) + maxProb;
+        
+        return probSum;
+       }
+       catch(exception& e) {
+               m->errorOut(e, "Classify", "getLogExpSum");
+               exit(1);
+       }
+}
+
+/**************************************************************************************************/
+
index 6582be48d12bcaee868c3d64865a665eff073c42..7b4c1022fef35aacbc33824d99ede551e55b187c 100644 (file)
 #include "database.hpp"
 #include "phylotree.h"
 
-
 class Sequence;
 
-
 /**************************************************************************************************/
 
 class Classify {
@@ -37,7 +35,6 @@ public:
 protected:
 
        map<string, string> taxonomy;  //name maps to taxonomy
-       //map<string, int> genusCount;  //maps genus to count - in essence a list of how many seqs are in each taxonomy
        map<string, int>::iterator itTax;
        map<string, string>::iterator it;
        Database* database;
@@ -45,11 +42,12 @@ protected:
        
        string taxFile, templateFile, simpleTax;
        vector<string> names;
-       int threadID;
+       int threadID, numLevels, numTaxa;
        bool flip, flipped, shortcuts;
        
        int readTaxonomy(string);
        vector<string> parseTax(string);
+    double getLogExpSum(vector<double>, int&);
        MothurOut* m;
        
 };
index 2e55673934a03b4c0ba5984f930499ec86e1e165..36bccc2b205635c6b1b0df872332534d615e3bb1 100644 (file)
@@ -21,9 +21,9 @@ vector<string> ClassifySeqsCommand::setParameters(){
         CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "none", "none",false,false); parameters.push_back(pcount);
                CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "none", "none",false,false); parameters.push_back(pgroup);
 
-               CommandParameter psearch("search", "Multiple", "kmer-blast-suffix-distance", "kmer", "", "", "",false,false); parameters.push_back(psearch);
+               CommandParameter psearch("search", "Multiple", "kmer-blast-suffix-distance-align", "kmer", "", "", "",false,false); parameters.push_back(psearch);
                CommandParameter pksize("ksize", "Number", "", "8", "", "", "",false,false); parameters.push_back(pksize);
-               CommandParameter pmethod("method", "Multiple", "bayesian-knn", "bayesian", "", "", "",false,false); parameters.push_back(pmethod);
+               CommandParameter pmethod("method", "Multiple", "wang-knn-zap", "wang", "", "", "",false,false); parameters.push_back(pmethod);
                CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
                CommandParameter pmatch("match", "Number", "", "1.0", "", "", "",false,false); parameters.push_back(pmatch);
                CommandParameter pmismatch("mismatch", "Number", "", "-1.0", "", "", "",false,false); parameters.push_back(pmismatch);
@@ -55,11 +55,11 @@ string ClassifySeqsCommand::getHelpString(){
                helpString += "The classify.seqs command reads a fasta file containing sequences and creates a .taxonomy file and a .tax.summary file.\n";
                helpString += "The classify.seqs command parameters are reference, fasta, name, group, count, search, ksize, method, taxonomy, processors, match, mismatch, gapopen, gapextend, numwanted and probs.\n";
                helpString += "The reference, fasta and taxonomy parameters are required. You may enter multiple fasta files by separating their names with dashes. ie. fasta=abrecovery.fasta-amzon.fasta \n";
-               helpString += "The search parameter allows you to specify the method to find most similar template.  Your options are: suffix, kmer, blast and distance. The default is kmer.\n";
+               helpString += "The search parameter allows you to specify the method to find most similar template.  Your options are: suffix, kmer, blast, align and distance. The default is kmer.\n";
                helpString += "The name parameter allows you add a names file with your fasta file, if you enter multiple fasta files, you must enter matching names files for them.\n";
                helpString += "The group parameter allows you add a group file so you can have the summary totals broken up by group.\n";
         helpString += "The count parameter allows you add a count file so you can have the summary totals broken up by group.\n";
-               helpString += "The method parameter allows you to specify classification method to use.  Your options are: bayesian and knn. The default is bayesian.\n";
+               helpString += "The method parameter allows you to specify classification method to use.  Your options are: wang, knn and zap. The default is wang.\n";
                helpString += "The ksize parameter allows you to specify the kmer size for finding most similar template to candidate.  The default is 8.\n";
                helpString += "The processors parameter allows you to specify the number of processors to use. The default is 1.\n";
 #ifdef USE_MPI
@@ -72,8 +72,8 @@ string ClassifySeqsCommand::getHelpString(){
                helpString += "The gapextend parameter allows you to specify the penalty for extending a gap in an alignment.  The default is -1.0.\n";
                helpString += "The numwanted parameter allows you to specify the number of sequence matches you want with the knn method.  The default is 10.\n";
                helpString += "The cutoff parameter allows you to specify a bootstrap confidence threshold for your taxonomy.  The default is 0.\n";
-               helpString += "The probs parameter shuts off the bootstrapping results for the bayesian method. The default is true, meaning you want the bootstrapping to be shown.\n";
-               helpString += "The iters parameter allows you to specify how many iterations to do when calculating the bootstrap confidence score for your taxonomy with the bayesian method.  The default is 100.\n";
+               helpString += "The probs parameter shuts off the bootstrapping results for the wang and zap method. The default is true, meaning you want the bootstrapping to be shown.\n";
+               helpString += "The iters parameter allows you to specify how many iterations to do when calculating the bootstrap confidence score for your taxonomy with the wang method.  The default is 100.\n";
                //helpString += "The flip parameter allows you shut off mothur's   The default is T.\n";
                helpString += "The classify.seqs command should be in the following format: \n";
                helpString += "classify.seqs(reference=yourTemplateFile, fasta=yourFastaFile, method=yourClassificationMethod, search=yourSearchmethod, ksize=yourKmerSize, taxonomy=yourTaxonomyFile, processors=yourProcessors) \n";
@@ -492,9 +492,6 @@ ClassifySeqsCommand::ClassifySeqsCommand(string option)  {
                        //check for optional parameter and set defaults
                        // ...at some point should added some additional type checking...
                        string temp;
-                       temp = validParameter.validFile(parameters, "ksize", false);            if (temp == "not found"){       temp = "8";                             }
-                       m->mothurConvert(temp, kmerSize); 
-                       
                        temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found"){       temp = m->getProcessors();      }
                        m->setProcessors(temp);
                        m->mothurConvert(temp, processors); 
@@ -536,7 +533,13 @@ ClassifySeqsCommand::ClassifySeqsCommand(string option)  {
                        
                        search = validParameter.validFile(parameters, "search", false);         if (search == "not found"){     search = "kmer";                }
                        
-                       method = validParameter.validFile(parameters, "method", false);         if (method == "not found"){     method = "bayesian";    }
+                       method = validParameter.validFile(parameters, "method", false);         if (method == "not found"){     method = "wang";        }
+            
+            temp = validParameter.validFile(parameters, "ksize", false);               if (temp == "not found"){       
+                temp = "8";    
+                if (method == "zap") { temp = "7"; }
+            }
+                       m->mothurConvert(temp, kmerSize); 
                        
                        temp = validParameter.validFile(parameters, "match", false);            if (temp == "not found"){       temp = "1.0";                   }
                        m->mothurConvert(temp, match);  
@@ -570,8 +573,13 @@ ClassifySeqsCommand::ClassifySeqsCommand(string option)  {
                        m->mothurConvert(temp, iters); 
 
                        
-                       if ((method == "bayesian") && (search != "kmer"))  { 
-                               m->mothurOut("The bayesian method requires the kmer search." + search + "will be disregarded." ); m->mothurOutEndLine();
+                       if ((method == "wang") && (search != "kmer"))  { 
+                               m->mothurOut("The wang method requires the kmer search. " + search + " will be disregarded, and kmer will be used." ); m->mothurOutEndLine();
+                               search = "kmer";
+                       }
+            
+            if ((method == "zap") && ((search != "kmer") && (search != "align")))  { 
+                               m->mothurOut("The zap method requires the kmer or align search. " + search + " will be disregarded, and kmer will be used." ); m->mothurOutEndLine();
                                search = "kmer";
                        }
                        
@@ -605,10 +613,16 @@ int ClassifySeqsCommand::execute(){
        try {
                if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
         
-               if(method == "bayesian"){       classify = new Bayesian(taxonomyFileName, templateFileName, search, kmerSize, cutoff, iters, rand(), flip, writeShortcuts);             }
+        string outputMethodTag = method + ".";
+               if(method == "wang"){   classify = new Bayesian(taxonomyFileName, templateFileName, search, kmerSize, cutoff, iters, rand(), flip, writeShortcuts);     }
                else if(method == "knn"){       classify = new Knn(taxonomyFileName, templateFileName, search, kmerSize, gapOpen, gapExtend, match, misMatch, numWanted, rand());                               }
+        else if(method == "zap"){      
+            outputMethodTag = search + "_" + outputMethodTag;
+            if (search == "kmer") {   classify = new KmerTree(templateFileName, taxonomyFileName, kmerSize, cutoff); }
+            else {  classify = new AlignTree(templateFileName, taxonomyFileName, cutoff);  }
+        }
                else {
-                       m->mothurOut(search + " is not a valid method option. I will run the command using bayesian.");
+                       m->mothurOut(search + " is not a valid method option. I will run the command using wang.");
                        m->mothurOutEndLine();
                        classify = new Bayesian(taxonomyFileName, templateFileName, search, kmerSize, cutoff, iters, rand(), flip, writeShortcuts);     
                }
@@ -633,10 +647,10 @@ int ClassifySeqsCommand::execute(){
             if (RippedTaxName != "") { RippedTaxName +=  "."; }   
           
                        if (outputDir == "") { outputDir += m->hasPath(fastaFileNames[s]); }
-                       string newTaxonomyFile = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])) + RippedTaxName + getOutputFileNameTag("taxonomy");
-                       string newaccnosFile = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])) + RippedTaxName + getOutputFileNameTag("accnos");
+                       string newTaxonomyFile = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])) + RippedTaxName + outputMethodTag + getOutputFileNameTag("taxonomy");
+                       string newaccnosFile = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])) + RippedTaxName + outputMethodTag +getOutputFileNameTag("accnos");
                        string tempTaxonomyFile = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])) + "taxonomy.temp";
-                       string taxSummary = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])) + RippedTaxName + getOutputFileNameTag("taxsummary");
+                       string taxSummary = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])) + RippedTaxName + outputMethodTag + getOutputFileNameTag("taxsummary");
                        
                        if ((method == "knn") && (search == "distance")) { 
                                string DistName = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])) + getOutputFileNameTag("matchdist");
@@ -798,12 +812,12 @@ int ClassifySeqsCommand::execute(){
                 if (hasCount) { 
                     ct = new CountTable();
                     ct->readTable(countfileNames[s]);
-                    taxaSum = new PhyloSummary(baseTName, ct);
+                    taxaSum = new PhyloSummary(taxonomyFileName, ct);
                     taxaSum->summarize(tempTaxonomyFile);
                 }else {
                     if (groupfile != "") {  group = groupfileNames[s]; groupMap = new GroupMap(group); groupMap->readMap(); }
                     
-                    taxaSum = new PhyloSummary(baseTName, groupMap);
+                    taxaSum = new PhyloSummary(taxonomyFileName, groupMap);
                     
                     if (m->control_pressed) { outputTypes.clear(); if (ct != NULL) { delete ct; }  if (groupMap != NULL) { delete groupMap; } delete taxaSum; for (int i = 0; i < outputNames.size(); i++) {   m->mothurRemove(outputNames[i]);        } delete classify; return 0; }
                     
index 6d11d9236778028e326c6f4fdd4925dd6430fd50..0cffec6948a83849a078fbef99b81cb5fe7b4433 100644 (file)
 #include "phylotree.h"
 #include "phylosummary.h"
 #include "knn.h"
+#include "kmertree.h"
+#include "aligntree.h"
 
 
-//KNN and Bayesian methods modeled from algorithms in
+//KNN and Wang methods modeled from algorithms in
 //Naı¨ve Bayesian Classifier for Rapid Assignment of rRNA Sequences 
 //into the New Bacterial Taxonomy􏰎† 
 //Qiong Wang,1 George M. Garrity,1,2 James M. Tiedje,1,2 and James R. Cole1* 
@@ -166,6 +168,11 @@ static DWORD WINAPI MyClassThreadFunction(LPVOID lpParam){
                Classify* myclassify;
                if(pDataArray->method == "bayesian"){   myclassify = new Bayesian(pDataArray->taxonomyFileName, pDataArray->templateFileName, pDataArray->search, pDataArray->kmerSize, pDataArray->cutoff, pDataArray->iters, pDataArray->threadID, pDataArray->flip, pDataArray->writeShortcuts);             }
                else if(pDataArray->method == "knn"){   myclassify = new Knn(pDataArray->taxonomyFileName, pDataArray->templateFileName, pDataArray->search, pDataArray->kmerSize, pDataArray->gapOpen, pDataArray->gapExtend, pDataArray->match, pDataArray->misMatch, pDataArray->numWanted, pDataArray->threadID);                           }
+        else if(pDataArray->method == "zap"){  
+            outputMethodTag = search + "_" + outputMethodTag;
+            if (pDataArray->search == "kmer") {   myclassify = new KmerTree(pDataArray->templateFileName, pDataArray->taxonomyFileName, pDataArray->kmerSize, pDataArray->cutoff); }
+            else {  myclassify = new AlignTree(pDataArray->templateFileName, pDataArray->taxonomyFileName, pDataArray->cutoff);  }
+        }
                else {
                        pDataArray->m->mothurOut(pDataArray->search + " is not a valid method option. I will run the command using bayesian.");
                        pDataArray->m->mothurOutEndLine();
diff --git a/kmernode.cpp b/kmernode.cpp
new file mode 100755 (executable)
index 0000000..c087cac
--- /dev/null
@@ -0,0 +1,209 @@
+/*
+ *  kmerNode.cpp
+ *  bayesian
+ *
+ *  Created by Pat Schloss on 10/11/11.
+ *  Copyright 2011 Patrick D. Schloss. All rights reserved.
+ *
+ */
+
+#include "kmernode.h"
+
+
+/**********************************************************************************************************************/
+
+KmerNode::KmerNode(string s, int l, int n) : TaxonomyNode(s, l), kmerSize(n) {
+       try {
+        int power4s[14] = { 1, 4, 16, 64, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304, 16777216, 67108864 };
+        
+        numPossibleKmers = power4s[kmerSize];
+        numUniqueKmers = 0;
+        
+        kmerVector.assign(numPossibleKmers, 0);
+    }
+       catch(exception& e) {
+               m->errorOut(e, "KmerNode", "KmerNode");
+               exit(1);
+       }
+}
+
+/**********************************************************************************************************************/
+
+void KmerNode::loadSequence(vector<int>& kmerProfile){
+       try {
+        for(int i=0;i<numPossibleKmers;i++){
+            if (m->control_pressed) { break; }
+            if(kmerVector[i] == 0 && kmerProfile[i] != 0)      {       numUniqueKmers++;       }
+            
+            kmerVector[i] += kmerProfile[i];
+        }
+        
+        numSeqs++;
+    }
+       catch(exception& e) {
+               m->errorOut(e, "KmerNode", "loadSequence");
+               exit(1);
+       }
+}      
+
+/**********************************************************************************************************************/
+
+string KmerNode::getKmerBases(int kmerNumber){
+       try {
+        //     Here we convert the kmer number into the kmer in terms of bases.
+        //
+        //     Example:        Score = 915 (for a 6-mer)
+        //                             Base6 = (915 / 4^0) % 4 = 915 % 4 = 3 => T      [T]
+        //                             Base5 = (915 / 4^1) % 4 = 228 % 4 = 0 => A      [AT]
+        //                             Base4 = (915 / 4^2) % 4 = 57 % 4 = 1 => C       [CAT]
+        //                             Base3 = (915 / 4^3) % 4 = 14 % 4 = 2 => G       [GCAT]
+        //                             Base2 = (915 / 4^4) % 4 = 3 % 4 = 3 => T        [TGCAT]
+        //                             Base1 = (915 / 4^5) % 4 = 0 % 4 = 0 => A        [ATGCAT] -> this checks out with the previous method
+        
+        int power4s[14] = { 1, 4, 16, 64, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304, 16777216, 67108864 };
+        
+        string kmer = "";
+        
+        if(kmerNumber == power4s[kmerSize]){//pow(4.,7)){      //      if the kmer number is the same as the maxKmer then it must
+            for(int i=0;i<kmerSize;i++){                                       //      have had an N in it and so we'll just call it N x kmerSize
+                kmer += 'N';
+            }
+        }
+        else{
+            for(int i=0;i<kmerSize;i++){
+                if (m->control_pressed) { return kmer; }
+                int nt = (int)(kmerNumber / (float)power4s[i]) % 4;            //      the '%' operator returns the remainder 
+                if(nt == 0)            {       kmer = 'A' + kmer;      }                               //      from int-based division ]
+                else if(nt == 1){      kmer = 'C' + kmer;      }
+                else if(nt == 2){      kmer = 'G' + kmer;      }
+                else if(nt == 3){      kmer = 'T' + kmer;      }
+            }
+        }
+        return kmer;
+    }
+       catch(exception& e) {
+               m->errorOut(e, "KmerNode", "getKmerBases");
+               exit(1);
+       }
+}
+
+/**************************************************************************************************/
+
+void KmerNode::addThetas(vector<int> newTheta, int newNumSeqs){
+       try {
+        for(int i=0;i<numPossibleKmers;i++){
+            if (m->control_pressed) { break; }
+            kmerVector[i] += newTheta[i];              
+        }
+        
+        //     if(alignLength == 0){
+        //             alignLength = (int)newTheta.size();
+        //             theta.resize(alignLength);
+        //             columnCounts.resize(alignLength);
+        //     }
+        //     
+        //     for(int i=0;i<alignLength;i++){ 
+        //             theta[i].A += newTheta[i].A;            columnCounts[i] += newTheta[i].A;
+        //             theta[i].T += newTheta[i].T;            columnCounts[i] += newTheta[i].T;
+        //             theta[i].G += newTheta[i].G;            columnCounts[i] += newTheta[i].G;
+        //             theta[i].C += newTheta[i].C;            columnCounts[i] += newTheta[i].C;
+        //             theta[i].gap += newTheta[i].gap;        columnCounts[i] += newTheta[i].gap;
+        //     }
+        
+        numSeqs += newNumSeqs;
+    }
+       catch(exception& e) {
+               m->errorOut(e, "KmerNode", "addThetas");
+               exit(1);
+       }
+}
+
+/**********************************************************************************************************************/
+
+int KmerNode::getNumUniqueKmers(){
+    try {
+        if(numUniqueKmers == 0){
+            
+            for(int i=0;i<numPossibleKmers;i++){
+                if (m->control_pressed) { return numUniqueKmers; }
+                if(kmerVector[i] != 0){
+                    numUniqueKmers++;
+                }
+                
+            }
+            
+        }
+        
+        return numUniqueKmers; 
+        
+    }
+       catch(exception& e) {
+               m->errorOut(e, "KmerNode", "getNumUniqueKmers");
+               exit(1);
+       }
+}
+
+/**********************************************************************************************************************/
+
+void KmerNode::printTheta(){
+       try {
+        m->mothurOut(name + "\n");
+        for(int i=0;i<numPossibleKmers;i++){
+            if(kmerVector[i] != 0){
+                m->mothurOut(getKmerBases(i) + '\t' + toString(kmerVector[i]) + "\n");
+            }
+        }
+        m->mothurOutEndLine(); 
+    }
+       catch(exception& e) {
+               m->errorOut(e, "KmerNode", "printTheta");
+               exit(1);
+       }
+       
+}
+/**************************************************************************************************/
+
+double KmerNode::getSimToConsensus(vector<int>& queryKmerProfile){
+       try {
+        double present = 0;
+        
+        for(int i=0;i<numPossibleKmers;i++){
+            if (m->control_pressed) { return present; }
+            if(queryKmerProfile[i] != 0 && kmerVector[i] != 0){
+                present++;
+            }
+        }      
+        
+        return present / double(queryKmerProfile.size() - kmerSize + 1);
+    }
+       catch(exception& e) {
+               m->errorOut(e, "KmerNode", "getSimToConsensus");
+               exit(1);
+       }
+}
+
+/**********************************************************************************************************************/
+
+double KmerNode::getPxGivenkj_D_j(vector<int>& queryKmerProfile)       {       
+       try {
+        double sumLogProb = 0.0000;
+        double alpha = 1.0 / (double)totalSeqs;        //flat prior
+        //     double alpha = pow((1.0 / (double)numUniqueKmers), numSeqs)+0.0001;     //non-flat prior
+        
+        for(int i=0;i<numPossibleKmers;i++){
+            if (m->control_pressed) { return sumLogProb; }
+            if(queryKmerProfile[i] != 0){              //numUniqueKmers needs to be the value from Root;
+                sumLogProb += log((kmerVector[i] + alpha) / (numSeqs + numUniqueKmers * alpha));
+            }
+            
+        }
+        return sumLogProb;
+    }
+       catch(exception& e) {
+               m->errorOut(e, "KmerNode", "getPxGivenkj_D_j");
+               exit(1);
+       }
+
+}
+
+/**********************************************************************************************************************/
diff --git a/kmernode.h b/kmernode.h
new file mode 100755 (executable)
index 0000000..e15fb1d
--- /dev/null
@@ -0,0 +1,45 @@
+#ifndef KMERNODE
+#define KMERNODE
+
+/*
+ *  kmerNode.h
+ *  bayesian
+ *
+ *  Created by Pat Schloss on 10/11/11.
+ *  Copyright 2011 Patrick D. Schloss. All rights reserved.
+ *
+ */
+
+
+#include "taxonomynode.h"
+
+/**********************************************************************************************************************/
+
+class KmerNode : public TaxonomyNode {
+       
+public:
+       KmerNode(string, int, int);
+       void loadSequence(vector<int>&);
+       void printTheta();
+       double getPxGivenkj_D_j(vector<int>&);
+       double getSimToConsensus(vector<int>&);
+       void checkTheta(){};
+       void setNumUniqueKmers(int num) {       numUniqueKmers = num;   }
+       int getNumUniqueKmers();
+       void addThetas(vector<int>, int);
+       vector<int> getTheta()  {       return kmerVector;      }
+
+
+private:
+       string getKmerBases(int);
+       int kmerSize;                                                           //      value of k
+       int numPossibleKmers;                                           //      4^kmerSize
+       int numUniqueKmers;                                                     //      number of unique kmers seen in a group ~ O_kj
+       int numKmers;                                                           //      number of kmers in a sequence
+       vector<int> kmerVector;                                         //      counts of kmers across all sequences in a node
+};
+
+/**********************************************************************************************************************/
+
+#endif
+
diff --git a/kmertree.cpp b/kmertree.cpp
new file mode 100755 (executable)
index 0000000..fbf2bfb
--- /dev/null
@@ -0,0 +1,386 @@
+//
+//  kmerTree.cpp
+//  pdsBayesian
+//
+//  Created by Patrick Schloss on 4/3/12.
+//  Copyright (c) 2012 University of Michigan. All rights reserved.
+//
+
+#include "kmernode.h"
+#include "kmertree.h"
+
+/**************************************************************************************************/
+
+KmerTree::KmerTree(string referenceFileName, string taxonomyFileName, int k, int cutoff) : Classify(), confidenceThreshold(cutoff), kmerSize(k){
+       try {
+        KmerNode* newNode = new KmerNode("Root", 0, kmerSize);
+        tree.push_back(newNode);                       //      the tree is stored as a vector of elements of type TaxonomyNode
+        
+        int power4s[14] = { 1, 4, 16, 64, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304, 16777216, 67108864 };
+        numPossibleKmers = power4s[kmerSize];
+        
+        string refTaxonomy;
+        
+        readTaxonomy(taxonomyFileName);
+        
+        ifstream referenceFile;
+        m->openInputFile(referenceFileName, referenceFile);
+        bool error = false;
+        while(!referenceFile.eof()){
+            
+            if (m->control_pressed) { break; }
+            
+            Sequence seq(referenceFile);  m->gobble(referenceFile);
+            
+            if (seq.getName() != "") {
+                map<string, string>::iterator it = taxonomy.find(seq.getName());
+                
+                if (it != taxonomy.end()) {
+                    refTaxonomy = it->second;          //      lookup the taxonomy string for the current reference sequence
+                    vector<int> kmerProfile = ripKmerProfile(seq.getUnaligned());      //convert to kmer vector
+                    addTaxonomyToTree(seq.getName(), refTaxonomy, kmerProfile);
+                }else {
+                    m->mothurOut(seq.getName() + " is in your reference file, but not in your taxonomy file, please correct.\n"); error = true;
+                }
+            }
+        }
+        referenceFile.close();
+        
+        if (error) { m->control_pressed = true; }
+        
+        numTaxa = (int)tree.size();
+        numLevels = 0;
+        for(int i=0;i<numTaxa;i++){
+            int level = tree[i]->getLevel();
+            if(level > numLevels){     numLevels = level;      }
+        }
+        numLevels++;
+        
+        aggregateThetas();
+        
+        int dbSize = tree[0]->getNumSeqs();
+        
+        for(int i=0;i<numTaxa;i++){
+            tree[i]->checkTheta();
+            tree[i]->setNumUniqueKmers(tree[0]->getNumUniqueKmers());
+            tree[i]->setTotalSeqs(dbSize);
+        }
+    }
+       catch(exception& e) {
+               m->errorOut(e, "KmerTree", "KmerTree");
+               exit(1);
+       }
+}
+
+/**************************************************************************************************/
+
+KmerTree::~KmerTree(){
+       
+       for(int i=0;i<tree.size();i++){
+               delete tree[i];
+       }
+       
+}      
+/**********************************************************************************************************************/
+
+vector<int> KmerTree::ripKmerProfile(string sequence){
+    try {
+        //     assume all input sequences are unaligned
+        
+        int power4s[14] = { 1, 4, 16, 64, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304, 16777216, 67108864 };
+        
+        int nKmers = (int)sequence.length() - kmerSize + 1;
+        
+        vector<int> kmerProfile(numPossibleKmers + 1, 0);
+        
+        for(int i=0;i<nKmers;i++){
+            
+            if (m->control_pressed) { break; }
+            
+            int kmer = 0;
+            for(int j=0;j<kmerSize;j++){
+                if(toupper(sequence[j+i]) == 'A')              {       kmer += (0 * power4s[kmerSize-j-1]);    }
+                else if(toupper(sequence[j+i]) == 'C') {       kmer += (1 * power4s[kmerSize-j-1]);    }
+                else if(toupper(sequence[j+i]) == 'G') {       kmer += (2 * power4s[kmerSize-j-1]);    }
+                else if(toupper(sequence[j+i]) == 'U') {       kmer += (3 * power4s[kmerSize-j-1]);    }
+                else if(toupper(sequence[j+i]) == 'T') {       kmer += (3 * power4s[kmerSize-j-1]);    }
+                else                                                                   {       kmer = power4s[kmerSize]; j = kmerSize; }
+            }
+            kmerProfile[kmer] = 1;
+        }
+        
+        return kmerProfile;    
+    }
+       catch(exception& e) {
+               m->errorOut(e, "KmerTree", "ripKmerProfile");
+               exit(1);
+       }
+}
+
+/**************************************************************************************************/
+
+int KmerTree::addTaxonomyToTree(string seqName, string taxonomy, vector<int>& sequence){
+       try {
+        KmerNode* newNode;
+        string taxonName = "";
+        int treePosition = 0;                                                  //      the root is element 0
+        
+        
+        int level = 1;
+        
+        for(int i=0;i<taxonomy.length();i++){                  //      step through taxonomy string...
+            
+            if (m->control_pressed) { break; }
+            if(taxonomy[i] == ';'){                                            //      looking for semicolons...
+                
+                if (taxonName == "") {  m->mothurOut(seqName + " has an error in the taxonomy.  This may be due to a ;;"); m->mothurOutEndLine(); m->control_pressed = true; }
+                
+                int newIndex = tree[treePosition]->getChildIndex(taxonName);// look to see if your current node already
+                //        has a child with the new taxonName
+                if(newIndex != -1)     {       treePosition = newIndex;        }               //      if you've seen it before, jump to that
+                else {                                                                                                         //         position in the tree
+                    int newChildIndex = (int)tree.size();                                      //      otherwise, we'll have to create one...
+                    tree[treePosition]->makeChild(taxonName, newChildIndex);
+                    
+                    newNode = new KmerNode(taxonName, level, kmerSize);
+                    
+                    newNode->setParent(treePosition);
+                    
+                    tree.push_back(newNode);
+                    treePosition = newChildIndex;
+                }
+                
+                //     sequence data to that node to update that node's theta - seems slow...                          
+                taxonName = "";                                                                //      clear out the taxon name that we will build as we look 
+                level++;
+                
+            }                                                                                          //      for a semicolon
+            else{
+                taxonName += taxonomy[i];                                      //      keep adding letters until we reach a semicolon
+            }
+        }
+        
+        tree[treePosition]->loadSequence(sequence);    //      now that we've gotten to the correct node, add the
+        
+        return 0;
+    }
+       catch(exception& e) {
+               m->errorOut(e, "KmerTree", "addTaxonomyToTree");
+               exit(1);
+       }
+       
+}
+
+/**************************************************************************************************/
+
+int KmerTree::aggregateThetas(){
+       try {
+        vector<vector<int> > levelMatrix(numLevels+1);
+        
+        for(int i=0;i<tree.size();i++){
+            if (m->control_pressed) { return 0; }
+            levelMatrix[tree[i]->getLevel()].push_back(i);
+        }
+        
+        for(int i=numLevels-1;i>0;i--) {
+            if (m->control_pressed) { return 0; }
+            
+            for(int j=0;j<levelMatrix[i].size();j++){
+                
+                KmerNode* holder = tree[levelMatrix[i][j]];
+                
+                tree[holder->getParent()]->addThetas(holder->getTheta(), holder->getNumSeqs());                                
+            }
+        }
+        
+        return 0;
+       }
+       catch(exception& e) {
+               m->errorOut(e, "KmerTree", "aggregateThetas");
+               exit(1);
+       }
+}
+
+/**************************************************************************************************/
+
+int KmerTree::getMinRiskIndexKmer(vector<int>& sequence, vector<int>& taxaIndices, vector<double>& probabilities){
+       try {
+        int numProbs = (int)probabilities.size();
+        
+        vector<double> G(numProbs, 0.2);       //a random sequence will, on average, be 20% similar to any other sequence; not sure that this holds up for kmers; whatever.
+        vector<double> risk(numProbs, 0);
+        
+        for(int i=1;i<numProbs;i++){ //use if you want the outlier group
+            if (m->control_pressed) { return 0; }
+            G[i] = tree[taxaIndices[i]]->getSimToConsensus(sequence);
+        }
+        
+        double minRisk = 1e6;
+        int minRiskIndex = 0;
+        
+        for(int i=0;i<numProbs;i++){
+            if (m->control_pressed) { return 0; }
+            for(int j=0;j<numProbs;j++){
+                if(i != j){
+                    risk[i] += probabilities[j] * G[j];
+                }                      
+            }
+            
+            if(risk[i] < minRisk){
+                minRisk = risk[i];
+                minRiskIndex = i;
+            }
+        }
+        
+        return minRiskIndex;
+    }
+       catch(exception& e) {
+               m->errorOut(e, "KmerTree", "getMinRiskIndexKmer");
+               exit(1);
+       }
+}
+
+/**************************************************************************************************/
+
+int KmerTree::sanityCheck(vector<vector<int> >& indices, vector<int>& maxIndices){
+       try {
+        int finalLevel = (int)indices.size()-1;
+        
+        for(int position=1;position<indices.size();position++){
+            if (m->control_pressed) { return 0; }
+            int predictedParent = tree[indices[position][maxIndices[position]]]->getParent();
+            int actualParent = indices[position-1][maxIndices[position-1]];
+            
+            if(predictedParent != actualParent){
+                finalLevel = position - 1;
+                return finalLevel;
+            }
+        }
+        return finalLevel;
+       }
+       catch(exception& e) {
+               m->errorOut(e, "KmerTree", "sanityCheck");
+               exit(1);
+       }
+}
+
+/**************************************************************************************************/
+string KmerTree::getTaxonomy(Sequence* thisSeq){
+       try {
+        string seqName = thisSeq->getName(); string querySequence = thisSeq->getAligned(); string taxonProbabilityString = "";
+        string unalignedSeq = thisSeq->getUnaligned();
+        
+        double logPOutlier = (querySequence.length() - kmerSize + 1) * log(1.0/(double)tree[0]->getNumUniqueKmers());
+        
+        vector<int> queryProfile = ripKmerProfile(unalignedSeq);       //convert to kmer vector
+        
+        vector<vector<double> > pXgivenKj_D_j(numLevels);
+        vector<vector<int> > indices(numLevels);
+        for(int i=0;i<numLevels;i++){
+            if (m->control_pressed) { return taxonProbabilityString; }
+            pXgivenKj_D_j[i].push_back(logPOutlier);
+            indices[i].push_back(-1);
+        }
+        
+        for(int i=0;i<numTaxa;i++){
+            if (m->control_pressed) { return taxonProbabilityString; }
+            pXgivenKj_D_j[tree[i]->getLevel()].push_back(tree[i]->getPxGivenkj_D_j(queryProfile));
+            indices[tree[i]->getLevel()].push_back(i);
+        }
+        
+        vector<double> sumLikelihood(numLevels, 0);
+        vector<double> bestPosterior(numLevels, 0);
+        vector<int> maxIndex(numLevels, 0);
+        int maxPosteriorIndex;
+        
+        //let's find the best level and taxa within that level
+        for(int i=0;i<numLevels;i++){ //go across all j's - from the root to genus
+            if (m->control_pressed) { return taxonProbabilityString; }
+            
+            int numTaxaInLevel = (int)indices[i].size();
+            
+            vector<double> posteriors(numTaxaInLevel, 0);              
+            sumLikelihood[i] = getLogExpSum(pXgivenKj_D_j[i], maxPosteriorIndex);
+            
+            maxPosteriorIndex = 0;
+            for(int j=0;j<numTaxaInLevel;j++){
+                posteriors[j] = exp(pXgivenKj_D_j[i][j] - sumLikelihood[i]);
+                if(posteriors[j] > posteriors[maxPosteriorIndex]){     
+                    maxPosteriorIndex = j;
+                }
+                
+            }
+            
+            maxIndex[i] = getMinRiskIndexKmer(queryProfile, indices[i], posteriors);
+            
+            maxIndex[i] = maxPosteriorIndex;
+            bestPosterior[i] = posteriors[maxIndex[i]];        
+        }
+        
+        //     vector<double> pX_level(numLevels, 0);
+        //     
+        //     for(int i=0;i<numLevels;i++){
+        //             pX_level[i] = pXgivenKj_D_j[i][maxIndex[i]] - tree[indices[i][maxIndex[i]]]->getNumSeqs();
+        //     }
+        //     
+        //     int max_pLevel_X_index = -1;
+        //     double pX_level_sum = getLogExpSum(pX_level, max_pLevel_X_index);
+        //     double max_pLevel_X = exp(pX_level[max_pLevel_X_index] - pX_level_sum);
+        //     
+        //     vector<double> pLevel_X(numLevels, 0);
+        //     for(int i=0;i<numLevels;i++){
+        //             pLevel_X[i] = exp(pX_level[i] - pX_level_sum);
+        //     }
+        
+        int saneDepth = sanityCheck(indices, maxIndex);
+        
+        
+        //     stringstream levelProbabilityOutput;
+        //     levelProbabilityOutput.setf(ios::fixed, ios::floatfield);
+        //     levelProbabilityOutput.setf(ios::showpoint);
+        
+        
+        //taxonProbabilityOutput << seqName << '\t';
+        //     taxonProbabilityOutput << seqName << '(' << max_pLevel_X_index << ';' << max_pLevel_X << ')' << '\t';
+        //     levelProbabilityOutput << seqName << '(' << max_pLevel_X_index << ';' << max_pLevel_X << ')' << '\t';
+        simpleTax = "";
+        int savedspot = 1;
+        taxonProbabilityString = "";
+        for(int i=1;i<=saneDepth;i++){
+            if (m->control_pressed) { return taxonProbabilityString; }
+            int confidenceScore = (int) (bestPosterior[i] * 100);
+            if (confidenceScore >= confidenceThreshold) {
+                if(indices[i][maxIndex[i]] != -1){
+                    taxonProbabilityString += tree[indices[i][maxIndex[i]]]->getName() + "(" + toString(confidenceScore) + ");";
+                    simpleTax += tree[indices[i][maxIndex[i]]]->getName() + ";";
+                    
+                    //                 levelProbabilityOutput << tree[indices[i][maxIndex[i]]]->getName() << '(' << setprecision(6) << pLevel_X[i] << ");";
+                }
+                else{
+                    taxonProbabilityString += "unclassified" + '(' + toString(confidenceScore) + ");";
+                    //                 levelProbabilityOutput << "unclassified" << '(' << setprecision(6) << pLevel_X[i] << ");";
+                    simpleTax += "unclassified;";
+                }
+            }else { break; }
+            savedspot = i;
+        }
+        
+        
+        
+        for(int i=savedspot+1;i<numLevels;i++){
+            if (m->control_pressed) { return taxonProbabilityString; }
+            taxonProbabilityString += "unclassified(0);";
+            simpleTax += "unclassified;";
+        }
+        
+        return taxonProbabilityString;
+       }
+       catch(exception& e) {
+               m->errorOut(e, "KmerTree", "getTaxonomy");
+               exit(1);
+       }
+}
+
+
+/**************************************************************************************************/
+
diff --git a/kmertree.h b/kmertree.h
new file mode 100755 (executable)
index 0000000..f7c10ef
--- /dev/null
@@ -0,0 +1,37 @@
+//
+//  kmerTree.h
+//  pdsBayesian
+//
+//  Created by Patrick Schloss on 4/3/12.
+//  Copyright (c) 2012 University of Michigan. All rights reserved.
+//
+
+#ifndef pdsBayesian_kmerTree_h
+#define pdsBayesian_kmerTree_h
+
+#include "classify.h"
+
+class KmerNode;
+
+class KmerTree : public Classify {
+       
+public:
+       KmerTree(string, string, int, int);
+       ~KmerTree();
+       
+    string getTaxonomy(Sequence*);
+
+private:
+    int addTaxonomyToTree(string, string, vector<int>&);
+       vector<int> ripKmerProfile(string);
+       int getMinRiskIndexKmer(vector<int>&, vector<int>&, vector<double>&);
+       int aggregateThetas();
+       int sanityCheck(vector<vector<int> >&, vector<int>&);
+
+       int kmerSize;
+       int numPossibleKmers, confidenceThreshold;
+       vector<KmerNode*> tree;
+
+};
+
+#endif
diff --git a/taxonomynode.cpp b/taxonomynode.cpp
new file mode 100755 (executable)
index 0000000..b90bda1
--- /dev/null
@@ -0,0 +1,72 @@
+/*
+ *  taxonomynode.cpp
+ *  
+ *
+ *  Created by Pat Schloss on 7/8/11.
+ *  Copyright 2011 Patrick D. Schloss. All rights reserved.
+ *
+ */
+
+/**************************************************************************************************/
+
+#include "taxonomynode.h"
+
+/**************************************************************************************************/
+
+TaxonomyNode::TaxonomyNode(string n, int l): name(n), level(l){
+    m = MothurOut::getInstance();
+       parent = -1;
+       numChildren = 0;
+       numSeqs = 0;
+}
+
+/**************************************************************************************************/
+
+void TaxonomyNode::setName(string n)                   {       name = n;                                       }
+
+/**************************************************************************************************/
+
+string TaxonomyNode::getName()                                 {       return name;                            }
+
+/**************************************************************************************************/
+
+void TaxonomyNode::setParent(int p)                            {       parent = p;                                     }
+
+/**************************************************************************************************/
+
+int TaxonomyNode::getParent()                                  {       return parent;                          }
+
+/**************************************************************************************************/
+
+void TaxonomyNode::makeChild(string c, int i)  {       children[c] = i;                        }
+
+
+/**************************************************************************************************/
+
+map<string, int> TaxonomyNode::getChildren()   {       return children;                        }
+
+/**************************************************************************************************/
+
+int TaxonomyNode::getChildIndex(string c){
+       map<string, int>::iterator it = children.find(c);
+       if(it != children.end())        {       return it->second;                      }
+       else                                            {       return -1;                                      }       
+}
+
+/**************************************************************************************************/
+
+int    TaxonomyNode::getNumKids()                                      {       return (int)children.size();            }
+
+/**************************************************************************************************/
+
+int    TaxonomyNode::getNumSeqs()                                      {       return numSeqs;                         }
+
+/**************************************************************************************************/
+
+void TaxonomyNode::setTotalSeqs(int n)                 {       totalSeqs = n;                          }
+
+/**************************************************************************************************/
+
+int TaxonomyNode::getLevel()                                   {       return level;                           }
+
+/**************************************************************************************************/
diff --git a/taxonomynode.h b/taxonomynode.h
new file mode 100755 (executable)
index 0000000..08bad3e
--- /dev/null
@@ -0,0 +1,53 @@
+#ifndef TAXONOMYNODE
+#define TAXONOMYNODE
+
+/*
+ *  taxonomynode.h
+ *  
+ *
+ *  Created by Pat Schloss on 7/8/11.
+ *  Copyright 2011 Patrick D. Schloss. All rights reserved.
+ *
+ */
+
+/**************************************************************************************************/
+
+#include "mothurout.h"
+/**************************************************************************************************/
+
+class TaxonomyNode {
+       
+public:
+       TaxonomyNode();
+       TaxonomyNode(string, int);
+       void setName(string);
+       string getName();
+
+
+       void setParent(int);
+       int getParent();
+       
+       void makeChild(string, int);
+       map<string, int> getChildren();
+       int getChildIndex(string);
+       int     getNumKids();
+       int getNumSeqs();
+       void setTotalSeqs(int);
+       int getLevel();
+       
+private:
+       int parent;
+       map<string, int> children;
+       int numChildren;
+       int level;
+       
+protected:
+    MothurOut* m;
+       int numSeqs;
+       int totalSeqs;
+       string name;
+};
+
+/**************************************************************************************************/
+
+#endif