X-Git-Url: https://git.donarmstrong.com/?p=mothur.git;a=blobdiff_plain;f=classifyseqscommand.h;h=59d9ee275800195255961639d77fb6c57d97f417;hp=acee70c5cb474dee4e25eea4a6dcf6b5bd55fa06;hb=d1c97b8c04bb75faca1e76ffad60b37a4d789d3d;hpb=a33a385cc5b7481488f92f794425f01fbf40a543 diff --git a/classifyseqscommand.h b/classifyseqscommand.h index acee70c..59d9ee2 100644 --- a/classifyseqscommand.h +++ b/classifyseqscommand.h @@ -19,9 +19,11 @@ #include "phylotree.h" #include "phylosummary.h" #include "knn.h" +#include "kmertree.h" +#include "aligntree.h" -//KNN and Bayesian methods modeled from algorithms in +//KNN and Wang methods modeled from algorithms in //Naı¨ve Bayesian Classifier for Rapid Assignment of rRNA Sequences //into the New Bacterial Taxonomy􏰎† //Qiong Wang,1 George M. Garrity,1,2 James M. Tiedje,1,2 and James R. Cole1* @@ -41,7 +43,9 @@ public: vector setParameters(); string getCommandName() { return "classify.seqs"; } string getCommandCategory() { return "Phylotype Analysis"; } + string getHelpString(); + string getOutputPattern(string); string getCitation() { return "Wang Q, Garrity GM, Tiedje JM, Cole JR (2007). Naive Bayesian classifier for rapid assignment of rRNA sequences into the new bacterial taxonomy. Appl Environ Microbiol 73: 5261-7. [ for Bayesian classifier ] \nAltschul SF, Madden TL, Schaffer AA, Zhang J, Zhang Z, Miller W, Lipman DJ (1997). Gapped BLAST and PSI-BLAST: a new generation of protein database search programs. Nucleic Acids Res 25: 3389-402. [ for BLAST ] \nDeSantis TZ, Hugenholtz P, Larsen N, Rojas M, Brodie EL, Keller K, Huber T, Dalevi D, Hu P, Andersen GL (2006). Greengenes, a chimera-checked 16S rRNA gene database and workbench compatible with ARB. Appl Environ Microbiol 72: 5069-72. [ for kmer ] \nhttp://www.mothur.org/wiki/Classify.seqs"; } string getDescription() { return "classify sequences"; } @@ -61,6 +65,7 @@ private: vector lines; vector fastaFileNames; vector namefileNames; + vector countfileNames; vector groupfileNames; vector outputNames; map > nameMap; @@ -69,13 +74,12 @@ private: Classify* classify; ReferenceDB* rdb; - string fastaFileName, templateFileName, distanceFileName, namefile, search, method, taxonomyFileName, outputDir, groupfile; + string fastaFileName, templateFileName, countfile, distanceFileName, namefile, search, method, taxonomyFileName, outputDir, groupfile; int processors, kmerSize, numWanted, cutoff, iters; float match, misMatch, gapOpen, gapExtend; - bool abort, probs, save, flip; + bool abort, probs, save, flip, hasName, hasCount, writeShortcuts; int driver(linePair*, string, string, string, string); - void appendTaxFiles(string, string); int createProcesses(string, string, string, string); string addUnclassifieds(string, int); @@ -99,10 +103,10 @@ struct classifyData { MothurOut* m; float match, misMatch, gapOpen, gapExtend; int count, kmerSize, threadID, cutoff, iters, numWanted; - bool probs, flip; + bool probs, flip, writeShortcuts; classifyData(){} - classifyData(string acc, bool p, string me, string te, string tx, string a, string r, string f, string se, int ks, int i, int numW, MothurOut* mout, unsigned long long st, unsigned long long en, float ma, float misMa, float gapO, float gapE, int cut, int tid, bool fli) { + classifyData(string acc, bool p, string me, string te, string tx, string a, string r, string f, string se, int ks, int i, int numW, MothurOut* mout, unsigned long long st, unsigned long long en, float ma, float misMa, float gapO, float gapE, int cut, int tid, bool fli, bool wsh) { accnos = acc; taxonomyFileName = tx; templateFileName = te; @@ -126,6 +130,7 @@ struct classifyData { probs = p; count = 0; flip = fli; + writeShortcuts = wsh; } }; @@ -158,21 +163,25 @@ static DWORD WINAPI MyClassThreadFunction(LPVOID lpParam){ inFASTA.seekg(pDataArray->start-1); pDataArray->m->gobble(inFASTA); } - pDataArray->count = pDataArray->end; - //make classify Classify* myclassify; - if(pDataArray->method == "bayesian"){ myclassify = new Bayesian(pDataArray->taxonomyFileName, pDataArray->templateFileName, pDataArray->search, pDataArray->kmerSize, pDataArray->cutoff, pDataArray->iters, pDataArray->threadID, pDataArray->flip); } + string outputMethodTag = pDataArray->method + "."; + if(pDataArray->method == "bayesian"){ myclassify = new Bayesian(pDataArray->taxonomyFileName, pDataArray->templateFileName, pDataArray->search, pDataArray->kmerSize, pDataArray->cutoff, pDataArray->iters, pDataArray->threadID, pDataArray->flip, pDataArray->writeShortcuts); } else if(pDataArray->method == "knn"){ myclassify = new Knn(pDataArray->taxonomyFileName, pDataArray->templateFileName, pDataArray->search, pDataArray->kmerSize, pDataArray->gapOpen, pDataArray->gapExtend, pDataArray->match, pDataArray->misMatch, pDataArray->numWanted, pDataArray->threadID); } + else if(pDataArray->method == "zap"){ + outputMethodTag = pDataArray->search + "_" + outputMethodTag; + if (pDataArray->search == "kmer") { myclassify = new KmerTree(pDataArray->templateFileName, pDataArray->taxonomyFileName, pDataArray->kmerSize, pDataArray->cutoff); } + else { myclassify = new AlignTree(pDataArray->templateFileName, pDataArray->taxonomyFileName, pDataArray->cutoff); } + } else { pDataArray->m->mothurOut(pDataArray->search + " is not a valid method option. I will run the command using bayesian."); pDataArray->m->mothurOutEndLine(); - myclassify = new Bayesian(pDataArray->taxonomyFileName, pDataArray->templateFileName, pDataArray->search, pDataArray->kmerSize, pDataArray->cutoff, pDataArray->iters, pDataArray->threadID, pDataArray->flip); + myclassify = new Bayesian(pDataArray->taxonomyFileName, pDataArray->templateFileName, pDataArray->search, pDataArray->kmerSize, pDataArray->cutoff, pDataArray->iters, pDataArray->threadID, pDataArray->flip, pDataArray->writeShortcuts); } if (pDataArray->m->control_pressed) { delete myclassify; return 0; } - int count = 0; + pDataArray->count = 0; for(int i = 0; i < pDataArray->end; i++){ //end is the number of sequences to process if (pDataArray->m->control_pressed) { delete myclassify; return 0; } @@ -198,15 +207,15 @@ static DWORD WINAPI MyClassThreadFunction(LPVOID lpParam){ if (myclassify->getFlipped()) { outAcc << candidateSeq->getName() << endl; } - count++; + pDataArray->count++; } delete candidateSeq; //report progress - if((count) % 100 == 0){ pDataArray->m->mothurOut("Processing sequence: " + toString(count)); pDataArray->m->mothurOutEndLine(); } + if((pDataArray->count) % 100 == 0){ pDataArray->m->mothurOutJustToScreen("Processing sequence: " + toString(pDataArray->count)+"\n"); } } //report progress - if((count) % 100 != 0){ pDataArray->m->mothurOut("Processing sequence: " + toString(count)); pDataArray->m->mothurOutEndLine(); } + if((pDataArray->count) % 100 != 0){ pDataArray->m->mothurOutJustToScreen("Processing sequence: " + toString(pDataArray->count)+"\n"); } delete myclassify; inFASTA.close();