X-Git-Url: https://git.donarmstrong.com/?p=mothur.git;a=blobdiff_plain;f=classifyseqscommand.h;h=59d9ee275800195255961639d77fb6c57d97f417;hp=a73aa0c576f1dc6a3437d9c37557aa89cc0b451c;hb=d1c97b8c04bb75faca1e76ffad60b37a4d789d3d;hpb=7b3c9ca940891c1b20b3b7ec13e05d7e7b316b63 diff --git a/classifyseqscommand.h b/classifyseqscommand.h index a73aa0c..59d9ee2 100644 --- a/classifyseqscommand.h +++ b/classifyseqscommand.h @@ -10,39 +10,228 @@ * */ -#include "mothur.h" + #include "command.hpp" -#include "alignment.hpp" #include "classify.h" +#include "referencedb.h" +#include "sequence.hpp" +#include "bayesian.h" +#include "phylotree.h" +#include "phylosummary.h" +#include "knn.h" +#include "kmertree.h" +#include "aligntree.h" + + +//KNN and Wang methods modeled from algorithms in +//Naı¨ve Bayesian Classifier for Rapid Assignment of rRNA Sequences +//into the New Bacterial Taxonomy􏰎† +//Qiong Wang,1 George M. Garrity,1,2 James M. Tiedje,1,2 and James R. Cole1* +//Center for Microbial Ecology1 and Department of Microbiology and Molecular Genetics,2 Michigan State University, +//East Lansing, Michigan 48824 +//Received 10 January 2007/Accepted 18 June 2007 + + class ClassifySeqsCommand : public Command { public: - ClassifySeqsCommand(string); + ClassifySeqsCommand(string); + ClassifySeqsCommand(); ~ClassifySeqsCommand(); + + vector setParameters(); + string getCommandName() { return "classify.seqs"; } + string getCommandCategory() { return "Phylotype Analysis"; } + + string getHelpString(); + string getOutputPattern(string); + string getCitation() { return "Wang Q, Garrity GM, Tiedje JM, Cole JR (2007). Naive Bayesian classifier for rapid assignment of rRNA sequences into the new bacterial taxonomy. Appl Environ Microbiol 73: 5261-7. [ for Bayesian classifier ] \nAltschul SF, Madden TL, Schaffer AA, Zhang J, Zhang Z, Miller W, Lipman DJ (1997). Gapped BLAST and PSI-BLAST: a new generation of protein database search programs. Nucleic Acids Res 25: 3389-402. [ for BLAST ] \nDeSantis TZ, Hugenholtz P, Larsen N, Rojas M, Brodie EL, Keller K, Huber T, Dalevi D, Hu P, Andersen GL (2006). Greengenes, a chimera-checked 16S rRNA gene database and workbench compatible with ARB. Appl Environ Microbiol 72: 5069-72. [ for kmer ] \nhttp://www.mothur.org/wiki/Classify.seqs"; } + string getDescription() { return "classify sequences"; } + int execute(); - void help(); + void help() { m->mothurOut(getHelpString()); } + + private: struct linePair { - int start; - int numSeqs; - linePair(int i, int j) : start(i), numSeqs(j) {} + unsigned long long start; + unsigned long long end; + linePair(unsigned long long i, unsigned long long j) : start(i), end(j) {} }; + vector processIDS; //processid vector lines; + vector fastaFileNames; + vector namefileNames; + vector countfileNames; + vector groupfileNames; + vector outputNames; + map > nameMap; + map >::iterator itNames; Classify* classify; + ReferenceDB* rdb; - string fastaFileName, templateFileName, distanceFileName, search, method, taxonomyFileName; - int processors, kmerSize, numWanted, cutoff; + string fastaFileName, templateFileName, countfile, distanceFileName, namefile, search, method, taxonomyFileName, outputDir, groupfile; + int processors, kmerSize, numWanted, cutoff, iters; float match, misMatch, gapOpen, gapExtend; - bool abort; + bool abort, probs, save, flip, hasName, hasCount, writeShortcuts; - int driver(linePair*, string, string); - void appendTaxFiles(string, string); - void createProcesses(string, string); + int driver(linePair*, string, string, string, string); + int createProcesses(string, string, string, string); + string addUnclassifieds(string, int); + + int MPIReadNamesFile(string); + #ifdef USE_MPI + int driverMPI(int, int, MPI_File&, MPI_File&, MPI_File&, MPI_File&, vector&); + #endif +}; + +/**************************************************************************************************/ +//custom data structure for threads to use. +// This is passed by void pointer so it can be any data type +// that can be passed using a single void pointer (LPVOID). +struct classifyData { + string taxFName; + string tempTFName; + string filename; + string search, taxonomyFileName, templateFileName, method, accnos; + unsigned long long start; + unsigned long long end; + MothurOut* m; + float match, misMatch, gapOpen, gapExtend; + int count, kmerSize, threadID, cutoff, iters, numWanted; + bool probs, flip, writeShortcuts; + + classifyData(){} + classifyData(string acc, bool p, string me, string te, string tx, string a, string r, string f, string se, int ks, int i, int numW, MothurOut* mout, unsigned long long st, unsigned long long en, float ma, float misMa, float gapO, float gapE, int cut, int tid, bool fli, bool wsh) { + accnos = acc; + taxonomyFileName = tx; + templateFileName = te; + taxFName = a; + tempTFName = r; + filename = f; + search = se; + method = me; + m = mout; + start = st; + end = en; + match = ma; + misMatch = misMa; + gapOpen = gapO; + gapExtend = gapE; + kmerSize = ks; + cutoff = cut; + iters = i; + numWanted = numW; + threadID = tid; + probs = p; + count = 0; + flip = fli; + writeShortcuts = wsh; + } }; +/**************************************************************************************************/ +#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) +#else +static DWORD WINAPI MyClassThreadFunction(LPVOID lpParam){ + classifyData* pDataArray; + pDataArray = (classifyData*)lpParam; + + try { + ofstream outTax; + pDataArray->m->openOutputFile(pDataArray->taxFName, outTax); + + ofstream outTaxSimple; + pDataArray->m->openOutputFile(pDataArray->tempTFName, outTaxSimple); + + ofstream outAcc; + pDataArray->m->openOutputFile(pDataArray->accnos, outAcc); + + ifstream inFASTA; + pDataArray->m->openInputFile(pDataArray->filename, inFASTA); + + string taxonomy; + + //print header if you are process 0 + if ((pDataArray->start == 0) || (pDataArray->start == 1)) { + inFASTA.seekg(0); + }else { //this accounts for the difference in line endings. + inFASTA.seekg(pDataArray->start-1); pDataArray->m->gobble(inFASTA); + } + + //make classify + Classify* myclassify; + string outputMethodTag = pDataArray->method + "."; + if(pDataArray->method == "bayesian"){ myclassify = new Bayesian(pDataArray->taxonomyFileName, pDataArray->templateFileName, pDataArray->search, pDataArray->kmerSize, pDataArray->cutoff, pDataArray->iters, pDataArray->threadID, pDataArray->flip, pDataArray->writeShortcuts); } + else if(pDataArray->method == "knn"){ myclassify = new Knn(pDataArray->taxonomyFileName, pDataArray->templateFileName, pDataArray->search, pDataArray->kmerSize, pDataArray->gapOpen, pDataArray->gapExtend, pDataArray->match, pDataArray->misMatch, pDataArray->numWanted, pDataArray->threadID); } + else if(pDataArray->method == "zap"){ + outputMethodTag = pDataArray->search + "_" + outputMethodTag; + if (pDataArray->search == "kmer") { myclassify = new KmerTree(pDataArray->templateFileName, pDataArray->taxonomyFileName, pDataArray->kmerSize, pDataArray->cutoff); } + else { myclassify = new AlignTree(pDataArray->templateFileName, pDataArray->taxonomyFileName, pDataArray->cutoff); } + } + else { + pDataArray->m->mothurOut(pDataArray->search + " is not a valid method option. I will run the command using bayesian."); + pDataArray->m->mothurOutEndLine(); + myclassify = new Bayesian(pDataArray->taxonomyFileName, pDataArray->templateFileName, pDataArray->search, pDataArray->kmerSize, pDataArray->cutoff, pDataArray->iters, pDataArray->threadID, pDataArray->flip, pDataArray->writeShortcuts); + } + + if (pDataArray->m->control_pressed) { delete myclassify; return 0; } + + pDataArray->count = 0; + for(int i = 0; i < pDataArray->end; i++){ //end is the number of sequences to process + + if (pDataArray->m->control_pressed) { delete myclassify; return 0; } + + Sequence* candidateSeq = new Sequence(inFASTA); pDataArray->m->gobble(inFASTA); + + if (candidateSeq->getName() != "") { + + taxonomy = myclassify->getTaxonomy(candidateSeq); + + if (pDataArray->m->control_pressed) { delete candidateSeq; return 0; } + + if (taxonomy == "unknown;") { pDataArray->m->mothurOut("[WARNING]: " + candidateSeq->getName() + " could not be classified. You can use the remove.lineage command with taxon=unknown; to remove such sequences."); pDataArray->m->mothurOutEndLine(); } + + //output confidence scores or not + if (pDataArray->probs) { + outTax << candidateSeq->getName() << '\t' << taxonomy << endl; + }else{ + outTax << candidateSeq->getName() << '\t' << myclassify->getSimpleTax() << endl; + } + + outTaxSimple << candidateSeq->getName() << '\t' << myclassify->getSimpleTax() << endl; + + if (myclassify->getFlipped()) { outAcc << candidateSeq->getName() << endl; } + + pDataArray->count++; + } + delete candidateSeq; + //report progress + if((pDataArray->count) % 100 == 0){ pDataArray->m->mothurOutJustToScreen("Processing sequence: " + toString(pDataArray->count)+"\n"); } + + } + //report progress + if((pDataArray->count) % 100 != 0){ pDataArray->m->mothurOutJustToScreen("Processing sequence: " + toString(pDataArray->count)+"\n"); } + + delete myclassify; + inFASTA.close(); + outTax.close(); + outTaxSimple.close(); + + } + catch(exception& e) { + pDataArray->m->errorOut(e, "ClassifySeqsCommand", "MyClassThreadFunction"); + exit(1); + } +} +#endif + + + + #endif