]> git.donarmstrong.com Git - mothur.git/blobdiff - classifyseqscommand.h
changes while testing
[mothur.git] / classifyseqscommand.h
index 84f5f9f04d6752728c5f3bf72d5688cafd73b723..59d9ee275800195255961639d77fb6c57d97f417 100644 (file)
  *
  */
 
-#include "mothur.h"
+
 #include "command.hpp"
-//#include "alignment.hpp"
 #include "classify.h"
+#include "referencedb.h"
+#include "sequence.hpp"
+#include "bayesian.h"
+#include "phylotree.h"
+#include "phylosummary.h"
+#include "knn.h"
+#include "kmertree.h"
+#include "aligntree.h"
+
 
-//KNN and Bayesian methods modeled from algorithms in
+//KNN and Wang methods modeled from algorithms in
 //Naı¨ve Bayesian Classifier for Rapid Assignment of rRNA Sequences 
 //into the New Bacterial Taxonomy􏰎† 
 //Qiong Wang,1 George M. Garrity,1,2 James M. Tiedje,1,2 and James R. Cole1* 
@@ -29,42 +37,201 @@ class ClassifySeqsCommand : public Command {
        
 public:
        ClassifySeqsCommand(string);
+       ClassifySeqsCommand();
        ~ClassifySeqsCommand();
+       
+       vector<string> setParameters();
+       string getCommandName()                 { return "classify.seqs";               }
+       string getCommandCategory()             { return "Phylotype Analysis";  }
+       
+       string getHelpString(); 
+    string getOutputPattern(string);   
+       string getCitation() { return "Wang Q, Garrity GM, Tiedje JM, Cole JR (2007). Naive Bayesian classifier for rapid assignment of rRNA sequences into the new bacterial taxonomy. Appl Environ Microbiol 73: 5261-7. [ for Bayesian classifier ] \nAltschul SF, Madden TL, Schaffer AA, Zhang J, Zhang Z, Miller W, Lipman DJ (1997). Gapped BLAST and PSI-BLAST: a new generation of protein database search programs. Nucleic Acids Res 25: 3389-402. [ for BLAST ] \nDeSantis TZ, Hugenholtz P, Larsen N, Rojas M, Brodie EL, Keller K, Huber T, Dalevi D, Hu P, Andersen GL (2006). Greengenes, a chimera-checked 16S rRNA gene database and workbench compatible with ARB. Appl Environ Microbiol 72: 5069-72. [ for kmer ] \nhttp://www.mothur.org/wiki/Classify.seqs"; }
+       string getDescription()         { return "classify sequences"; }
+       
        int execute(); 
-       void help();    
+       void help() { m->mothurOut(getHelpString()); }  
+       
+       
        
 private:
        struct linePair {
-               unsigned long int start;
-               unsigned long int end;
-               linePair(unsigned long int i, unsigned long int j) : start(i), end(j) {}
+               unsigned long long start;
+               unsigned long long end;
+               linePair(unsigned long long i, unsigned long long j) : start(i), end(j) {}
        };
 
        vector<int> processIDS;   //processid
        vector<linePair*> lines;
        vector<string> fastaFileNames;
        vector<string> namefileNames;
+    vector<string> countfileNames;
        vector<string> groupfileNames;
+       vector<string> outputNames;
        map<string, vector<string> > nameMap;
        map<string,  vector<string> >::iterator itNames;
        
        Classify* classify;
+       ReferenceDB* rdb;
        
-       string fastaFileName, templateFileName, distanceFileName, namefile, search, method, taxonomyFileName, outputDir, groupfile;
+       string fastaFileName, templateFileName, countfile, distanceFileName, namefile, search, method, taxonomyFileName, outputDir, groupfile;
        int processors, kmerSize, numWanted, cutoff, iters;
        float match, misMatch, gapOpen, gapExtend;
-       bool abort, probs;
+       bool abort, probs, save, flip, hasName, hasCount, writeShortcuts;
        
-       int driver(linePair*, string, string, string);
-       void appendTaxFiles(string, string);
-       int createProcesses(string, string, string); 
+       int driver(linePair*, string, string, string, string);
+       int createProcesses(string, string, string, string); 
        string addUnclassifieds(string, int);
        
        int MPIReadNamesFile(string);
        #ifdef USE_MPI
-       int driverMPI(int, int, MPI_File&, MPI_File&, MPI_File&, vector<unsigned long int>&);
+       int driverMPI(int, int, MPI_File&, MPI_File&, MPI_File&, MPI_File&, vector<unsigned long long>&);
        #endif
 };
 
+/**************************************************************************************************/
+//custom data structure for threads to use.
+// This is passed by void pointer so it can be any data type
+// that can be passed using a single void pointer (LPVOID).
+struct classifyData {
+       string taxFName; 
+       string tempTFName; 
+       string filename;
+       string search, taxonomyFileName, templateFileName, method, accnos;
+       unsigned long long start;
+       unsigned long long end;
+       MothurOut* m;
+       float match, misMatch, gapOpen, gapExtend;
+       int count, kmerSize, threadID, cutoff, iters, numWanted;
+       bool probs, flip, writeShortcuts;
+        
+       classifyData(){}
+       classifyData(string acc, bool p, string me, string te, string tx, string a, string r, string f, string se, int ks, int i, int numW, MothurOut* mout, unsigned long long st, unsigned long long en, float ma, float misMa, float gapO, float gapE, int cut, int tid, bool fli, bool wsh) {
+               accnos = acc;
+               taxonomyFileName = tx;
+               templateFileName = te;
+               taxFName = a;
+               tempTFName = r;
+               filename = f;
+               search = se;
+               method = me;
+               m = mout;
+               start = st;
+               end = en;
+               match = ma; 
+               misMatch = misMa;
+               gapOpen = gapO; 
+               gapExtend = gapE; 
+               kmerSize = ks;
+               cutoff = cut;
+               iters = i;
+               numWanted = numW;
+               threadID = tid;
+               probs = p;
+               count = 0;
+               flip = fli;
+        writeShortcuts = wsh;
+       }
+};
+
+/**************************************************************************************************/
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
+#else
+static DWORD WINAPI MyClassThreadFunction(LPVOID lpParam){ 
+       classifyData* pDataArray;
+       pDataArray = (classifyData*)lpParam;
+       
+       try {
+               ofstream outTax;
+               pDataArray->m->openOutputFile(pDataArray->taxFName, outTax);
+               
+               ofstream outTaxSimple;
+               pDataArray->m->openOutputFile(pDataArray->tempTFName, outTaxSimple);
+               
+               ofstream outAcc;
+               pDataArray->m->openOutputFile(pDataArray->accnos, outAcc);
+               
+               ifstream inFASTA;
+               pDataArray->m->openInputFile(pDataArray->filename, inFASTA);
+               
+               string taxonomy;
+                               
+               //print header if you are process 0
+               if ((pDataArray->start == 0) || (pDataArray->start == 1)) {
+                       inFASTA.seekg(0);
+               }else { //this accounts for the difference in line endings. 
+                       inFASTA.seekg(pDataArray->start-1); pDataArray->m->gobble(inFASTA); 
+               }
+               
+               //make classify
+               Classify* myclassify;
+        string outputMethodTag = pDataArray->method + ".";
+               if(pDataArray->method == "bayesian"){   myclassify = new Bayesian(pDataArray->taxonomyFileName, pDataArray->templateFileName, pDataArray->search, pDataArray->kmerSize, pDataArray->cutoff, pDataArray->iters, pDataArray->threadID, pDataArray->flip, pDataArray->writeShortcuts);             }
+               else if(pDataArray->method == "knn"){   myclassify = new Knn(pDataArray->taxonomyFileName, pDataArray->templateFileName, pDataArray->search, pDataArray->kmerSize, pDataArray->gapOpen, pDataArray->gapExtend, pDataArray->match, pDataArray->misMatch, pDataArray->numWanted, pDataArray->threadID);                           }
+        else if(pDataArray->method == "zap"){  
+            outputMethodTag = pDataArray->search + "_" + outputMethodTag;
+            if (pDataArray->search == "kmer") {   myclassify = new KmerTree(pDataArray->templateFileName, pDataArray->taxonomyFileName, pDataArray->kmerSize, pDataArray->cutoff); }
+            else {  myclassify = new AlignTree(pDataArray->templateFileName, pDataArray->taxonomyFileName, pDataArray->cutoff);  }
+        }
+               else {
+                       pDataArray->m->mothurOut(pDataArray->search + " is not a valid method option. I will run the command using bayesian.");
+                       pDataArray->m->mothurOutEndLine();
+                       myclassify = new Bayesian(pDataArray->taxonomyFileName, pDataArray->templateFileName, pDataArray->search, pDataArray->kmerSize, pDataArray->cutoff, pDataArray->iters, pDataArray->threadID, pDataArray->flip, pDataArray->writeShortcuts);     
+               }
+               
+               if (pDataArray->m->control_pressed) { delete myclassify; return 0; }
+               
+               pDataArray->count = 0;
+               for(int i = 0; i < pDataArray->end; i++){ //end is the number of sequences to process
+                       
+                       if (pDataArray->m->control_pressed) { delete myclassify; return 0; }
+                       
+                       Sequence* candidateSeq = new Sequence(inFASTA); pDataArray->m->gobble(inFASTA);
+                       
+                       if (candidateSeq->getName() != "") {
+                               
+                               taxonomy = myclassify->getTaxonomy(candidateSeq);
+                               
+                               if (pDataArray->m->control_pressed) { delete candidateSeq; return 0; }
+                               
+                               if (taxonomy == "unknown;") { pDataArray->m->mothurOut("[WARNING]: " + candidateSeq->getName() + " could not be classified. You can use the remove.lineage command with taxon=unknown; to remove such sequences."); pDataArray->m->mothurOutEndLine(); }
+
+                               //output confidence scores or not
+                               if (pDataArray->probs) {
+                                       outTax << candidateSeq->getName() << '\t' << taxonomy << endl;
+                               }else{
+                                       outTax << candidateSeq->getName() << '\t' << myclassify->getSimpleTax() << endl;
+                               }
+                                       
+                               outTaxSimple << candidateSeq->getName() << '\t' << myclassify->getSimpleTax() << endl;
+                                       
+                               if (myclassify->getFlipped()) { outAcc << candidateSeq->getName() << endl; }
+                               
+                               pDataArray->count++;
+                       }
+                       delete candidateSeq;
+                       //report progress
+                       if((pDataArray->count) % 100 == 0){     pDataArray->m->mothurOutJustToScreen("Processing sequence: " + toString(pDataArray->count)+"\n");       }
+                       
+               }
+               //report progress
+               if((pDataArray->count) % 100 != 0){     pDataArray->m->mothurOutJustToScreen("Processing sequence: " + toString(pDataArray->count)+"\n");               }
+               
+               delete myclassify;
+               inFASTA.close();
+               outTax.close();
+               outTaxSimple.close();
+               
+       }
+       catch(exception& e) {
+               pDataArray->m->errorOut(e, "ClassifySeqsCommand", "MyClassThreadFunction");
+               exit(1);
+       }
+} 
+#endif
+
+
+
+
 #endif