1 #ifndef CLASSIFYSEQSCOMMAND_H
2 #define CLASSIFYSEQSCOMMAND_H
5 * classifyseqscommand.h
8 * Created by westcott on 11/2/09.
9 * Copyright 2009 Schloss Lab. All rights reserved.
14 #include "command.hpp"
16 #include "referencedb.h"
17 #include "sequence.hpp"
19 #include "phylotree.h"
20 #include "phylosummary.h"
23 #include "aligntree.h"
26 //KNN and Wang methods modeled from algorithms in
27 //Naïve Bayesian Classifier for Rapid Assignment of rRNA Sequences
28 //into the New Bacterial Taxonomy†
29 //Qiong Wang,1 George M. Garrity,1,2 James M. Tiedje,1,2 and James R. Cole1*
30 //Center for Microbial Ecology1 and Department of Microbiology and Molecular Genetics,2 Michigan State University,
31 //East Lansing, Michigan 48824
32 //Received 10 January 2007/Accepted 18 June 2007
// Command object for mothur's "classify.seqs" command; derives publicly from Command.
// NOTE(review): access specifiers, the closing of this class and the header of the
// nested linePair struct are not visible in this view of the file.
36 class ClassifySeqsCommand : public Command {
// Construction / destruction: one constructor taking a string, one default constructor.
39 ClassifySeqsCommand(string);
40 ClassifySeqsCommand();
41 ~ClassifySeqsCommand();
// Command metadata accessors. Those with inline bodies return fixed strings;
// the others are only declared here.
43 vector<string> setParameters();
44 string getCommandName() { return "classify.seqs"; }
45 string getCommandCategory() { return "Phylotype Analysis"; }
47 string getHelpString();
48 string getOutputPattern(string);
// Returns the citations for the Bayesian classifier (Wang et al. 2007), BLAST
// (Altschul et al. 1997) and kmer searching (DeSantis et al. 2006), followed by the wiki URL.
49 string getCitation() { return "Wang Q, Garrity GM, Tiedje JM, Cole JR (2007). Naive Bayesian classifier for rapid assignment of rRNA sequences into the new bacterial taxonomy. Appl Environ Microbiol 73: 5261-7. [ for Bayesian classifier ] \nAltschul SF, Madden TL, Schaffer AA, Zhang J, Zhang Z, Miller W, Lipman DJ (1997). Gapped BLAST and PSI-BLAST: a new generation of protein database search programs. Nucleic Acids Res 25: 3389-402. [ for BLAST ] \nDeSantis TZ, Hugenholtz P, Larsen N, Rojas M, Brodie EL, Keller K, Huber T, Dalevi D, Hu P, Andersen GL (2006). Greengenes, a chimera-checked 16S rRNA gene database and workbench compatible with ARB. Appl Environ Microbiol 72: 5069-72. [ for kmer ] \nhttp://www.mothur.org/wiki/Classify.seqs"; }
50 string getDescription() { return "classify sequences"; }
// Writes the help string through the m output object.
53 void help() { m->mothurOut(getHelpString()); }
// Members of linePair: a start/end pair of unsigned 64-bit values plus a constructor
// that initializes both. NOTE(review): units (byte offset vs. sequence count) are not
// established by the lines visible here - confirm against the .cpp.
59 unsigned long long start;
60 unsigned long long end;
61 linePair(unsigned long long i, unsigned long long j) : start(i), end(j) {}
// Per-run bookkeeping.
64 vector<int> processIDS; //processid
65 vector<linePair*> lines; // raw pointers; ownership is not shown in this header
// Lists of file names, one vector per file type, and the list of output files.
66 vector<string> fastaFileNames;
67 vector<string> namefileNames;
68 vector<string> countfileNames;
69 vector<string> groupfileNames;
70 vector<string> outputNames;
// Maps a string key to a list of strings, with an iterator kept as a member.
// NOTE(review): presumably populated from a names file - confirm.
71 map<string, vector<string> > nameMap;
72 map<string, vector<string> >::iterator itNames;
// Option values held as members (file names, search/method selection, numeric
// settings, alignment scoring values and boolean flags).
77 string fastaFileName, templateFileName, countfile, distanceFileName, namefile, search, method, taxonomyFileName, outputDir, groupfile;
78 int processors, kmerSize, numWanted, cutoff, iters;
79 float match, misMatch, gapOpen, gapExtend;
80 bool abort, probs, save, flip, hasName, hasCount, writeShortcuts;
// Worker routines; declared here, defined elsewhere.
82 int driver(linePair*, string, string, string, string);
83 int createProcesses(string, string, string, string);
84 string addUnclassifieds(string, int);
// MPI-specific routines. NOTE(review): these use MPI_File, so they are presumably
// guarded by an MPI preprocessor conditional that is not visible in this view.
86 int MPIReadNamesFile(string);
88 int driverMPI(int, int, MPI_File&, MPI_File&, MPI_File&, MPI_File&, vector<unsigned long long>&);
92 /**************************************************************************************************/
93 //custom data structure for threads to use.
94 // This is passed by void pointer so it can be any data type
95 // that can be passed using a single void pointer (LPVOID).
// Fields of classifyData, which MyClassThreadFunction reads through its pDataArray pointer.
// NOTE(review): the struct header and some members used by that function (e.g. m,
// taxFName, tempTFName, filename) are not visible in this view of the file.
100 string search, taxonomyFileName, templateFileName, method, accnos;
// start: when not 0 or 1, the thread function seeks the FASTA stream to start-1.
101 unsigned long long start;
// end: upper bound of the thread function's per-sequence loop.
102 unsigned long long end;
// Scoring values forwarded to the Knn classifier constructor.
104 float match, misMatch, gapOpen, gapExtend;
// count is reset to 0 by the thread function and reported in its progress messages.
105 int count, kmerSize, threadID, cutoff, iters, numWanted;
// probs selects which taxonomy string is written to the main taxonomy output.
106 bool probs, flip, writeShortcuts;
// Constructor copying its arguments into the fields. Only the tx, te and wsh
// assignments are visible here; the remaining assignments are not shown in this view.
109 classifyData(string acc, bool p, string me, string te, string tx, string a, string r, string f, string se, int ks, int i, int numW, MothurOut* mout, unsigned long long st, unsigned long long en, float ma, float misMa, float gapO, float gapE, int cut, int tid, bool fli, bool wsh) {
111 taxonomyFileName = tx;
112 templateFileName = te;
133 writeShortcuts = wsh;
137 /**************************************************************************************************/
138 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
140 static DWORD WINAPI MyClassThreadFunction(LPVOID lpParam){
141 classifyData* pDataArray;
142 pDataArray = (classifyData*)lpParam;
146 pDataArray->m->openOutputFile(pDataArray->taxFName, outTax);
148 ofstream outTaxSimple;
149 pDataArray->m->openOutputFile(pDataArray->tempTFName, outTaxSimple);
152 pDataArray->m->openOutputFile(pDataArray->accnos, outAcc);
155 pDataArray->m->openInputFile(pDataArray->filename, inFASTA);
159 //print header if you are process 0
160 if ((pDataArray->start == 0) || (pDataArray->start == 1)) {
162 }else { //this accounts for the difference in line endings.
163 inFASTA.seekg(pDataArray->start-1); pDataArray->m->gobble(inFASTA);
167 Classify* myclassify;
168 string outputMethodTag = pDataArray->method + ".";
169 if(pDataArray->method == "bayesian"){ myclassify = new Bayesian(pDataArray->taxonomyFileName, pDataArray->templateFileName, pDataArray->search, pDataArray->kmerSize, pDataArray->cutoff, pDataArray->iters, pDataArray->threadID, pDataArray->flip, pDataArray->writeShortcuts); }
170 else if(pDataArray->method == "knn"){ myclassify = new Knn(pDataArray->taxonomyFileName, pDataArray->templateFileName, pDataArray->search, pDataArray->kmerSize, pDataArray->gapOpen, pDataArray->gapExtend, pDataArray->match, pDataArray->misMatch, pDataArray->numWanted, pDataArray->threadID); }
171 else if(pDataArray->method == "zap"){
172 outputMethodTag = pDataArray->search + "_" + outputMethodTag;
173 if (pDataArray->search == "kmer") { myclassify = new KmerTree(pDataArray->templateFileName, pDataArray->taxonomyFileName, pDataArray->kmerSize, pDataArray->cutoff); }
174 else { myclassify = new AlignTree(pDataArray->templateFileName, pDataArray->taxonomyFileName, pDataArray->cutoff); }
177 pDataArray->m->mothurOut(pDataArray->search + " is not a valid method option. I will run the command using bayesian.");
178 pDataArray->m->mothurOutEndLine();
179 myclassify = new Bayesian(pDataArray->taxonomyFileName, pDataArray->templateFileName, pDataArray->search, pDataArray->kmerSize, pDataArray->cutoff, pDataArray->iters, pDataArray->threadID, pDataArray->flip, pDataArray->writeShortcuts);
182 if (pDataArray->m->control_pressed) { delete myclassify; return 0; }
184 pDataArray->count = 0;
185 for(int i = 0; i < pDataArray->end; i++){ //end is the number of sequences to process
187 if (pDataArray->m->control_pressed) { delete myclassify; return 0; }
189 Sequence* candidateSeq = new Sequence(inFASTA); pDataArray->m->gobble(inFASTA);
191 if (candidateSeq->getName() != "") {
193 taxonomy = myclassify->getTaxonomy(candidateSeq);
195 if (pDataArray->m->control_pressed) { delete candidateSeq; return 0; }
197 if (taxonomy == "unknown;") { pDataArray->m->mothurOut("[WARNING]: " + candidateSeq->getName() + " could not be classified. You can use the remove.lineage command with taxon=unknown; to remove such sequences."); pDataArray->m->mothurOutEndLine(); }
199 //output confidence scores or not
200 if (pDataArray->probs) {
201 outTax << candidateSeq->getName() << '\t' << taxonomy << endl;
203 outTax << candidateSeq->getName() << '\t' << myclassify->getSimpleTax() << endl;
206 outTaxSimple << candidateSeq->getName() << '\t' << myclassify->getSimpleTax() << endl;
208 if (myclassify->getFlipped()) { outAcc << candidateSeq->getName() << endl; }
214 if((pDataArray->count) % 100 == 0){ pDataArray->m->mothurOutJustToScreen("Processing sequence: " + toString(pDataArray->count)+"\n"); }
218 if((pDataArray->count) % 100 != 0){ pDataArray->m->mothurOutJustToScreen("Processing sequence: " + toString(pDataArray->count)+"\n"); }
223 outTaxSimple.close();
226 catch(exception& e) {
227 pDataArray->m->errorOut(e, "ClassifySeqsCommand", "MyClassThreadFunction");