#include "phylotree.h"
#include "phylosummary.h"
#include "knn.h"
+#include "kmertree.h"
+#include "aligntree.h"
-//KNN and Bayesian methods modeled from algorithms in
+//KNN and Wang methods modeled from algorithms in
//Naive Bayesian Classifier for Rapid Assignment of rRNA Sequences
//into the New Bacterial Taxonomy
//Qiong Wang, George M. Garrity, James M. Tiedje, and James R. Cole
vector<string> setParameters();
//Returns the name of this command: "classify.seqs".
string getCommandName() {
    const string commandName = "classify.seqs";
    return commandName;
}
//Returns the category label for this command: "Phylotype Analysis".
string getCommandCategory() {
    const string category = "Phylotype Analysis";
    return category;
}
+
string getHelpString();
+ string getOutputPattern(string);
//Returns the citation text for this command: three literature references
//(each tagged with the feature it is cited for and ending in a newline)
//followed by the wiki URL.
//The adjacent literals below are concatenated by the compiler into one string.
string getCitation() {
    const string citation =
        "Wang Q, Garrity GM, Tiedje JM, Cole JR (2007). Naive Bayesian classifier for rapid assignment of rRNA sequences into the new bacterial taxonomy. Appl Environ Microbiol 73: 5261-7. [ for Bayesian classifier ] \n"
        "Altschul SF, Madden TL, Schaffer AA, Zhang J, Zhang Z, Miller W, Lipman DJ (1997). Gapped BLAST and PSI-BLAST: a new generation of protein database search programs. Nucleic Acids Res 25: 3389-402. [ for BLAST ] \n"
        "DeSantis TZ, Hugenholtz P, Larsen N, Rojas M, Brodie EL, Keller K, Huber T, Dalevi D, Hu P, Andersen GL (2006). Greengenes, a chimera-checked 16S rRNA gene database and workbench compatible with ARB. Appl Environ Microbiol 72: 5069-72. [ for kmer ] \n"
        "http://www.mothur.org/wiki/Classify.seqs";
    return citation;
}
//Returns the one-line description of this command: "classify sequences".
string getDescription() {
    const string description = "classify sequences";
    return description;
}
vector<linePair*> lines;
vector<string> fastaFileNames;
vector<string> namefileNames;
+ vector<string> countfileNames;
vector<string> groupfileNames;
vector<string> outputNames;
map<string, vector<string> > nameMap;
Classify* classify;
ReferenceDB* rdb;
- string fastaFileName, templateFileName, distanceFileName, namefile, search, method, taxonomyFileName, outputDir, groupfile;
+ string fastaFileName, templateFileName, countfile, distanceFileName, namefile, search, method, taxonomyFileName, outputDir, groupfile;
int processors, kmerSize, numWanted, cutoff, iters;
float match, misMatch, gapOpen, gapExtend;
- bool abort, probs, save, flip;
+ bool abort, probs, save, flip, hasName, hasCount, writeShortcuts;
int driver(linePair*, string, string, string, string);
- void appendTaxFiles(string, string);
int createProcesses(string, string, string, string);
string addUnclassifieds(string, int);
MothurOut* m;
float match, misMatch, gapOpen, gapExtend;
int count, kmerSize, threadID, cutoff, iters, numWanted;
- bool probs, flip;
+ bool probs, flip, writeShortcuts;
//Default constructor: the body is empty, so it assigns nothing to any member.
classifyData(){}
//Builds a classifyData from the classification settings supplied by the caller.
//The assignments visible here store acc, tx, te, p, fli and wsh in the matching
//members and start count at 0.
//NOTE(review): the remaining parameters are presumably stored by assignments
//that are not shown in this excerpt - confirm against the full file.
- classifyData(string acc, bool p, string me, string te, string tx, string a, string r, string f, string se, int ks, int i, int numW, MothurOut* mout, unsigned long long st, unsigned long long en, float ma, float misMa, float gapO, float gapE, int cut, int tid, bool fli) {
+ classifyData(string acc, bool p, string me, string te, string tx, string a, string r, string f, string se, int ks, int i, int numW, MothurOut* mout, unsigned long long st, unsigned long long en, float ma, float misMa, float gapO, float gapE, int cut, int tid, bool fli, bool wsh) {
accnos = acc;
taxonomyFileName = tx;
templateFileName = te;
probs = p;
//count starts at zero for every new instance
count = 0;
flip = fli;
+ writeShortcuts = wsh;
}
};
inFASTA.seekg(pDataArray->start-1); pDataArray->m->gobble(inFASTA);
}
- pDataArray->count = pDataArray->end;
-
//make classify
Classify* myclassify;
- if(pDataArray->method == "bayesian"){ myclassify = new Bayesian(pDataArray->taxonomyFileName, pDataArray->templateFileName, pDataArray->search, pDataArray->kmerSize, pDataArray->cutoff, pDataArray->iters, pDataArray->threadID, pDataArray->flip); }
+ string outputMethodTag = pDataArray->method + ".";
+ if(pDataArray->method == "bayesian"){ myclassify = new Bayesian(pDataArray->taxonomyFileName, pDataArray->templateFileName, pDataArray->search, pDataArray->kmerSize, pDataArray->cutoff, pDataArray->iters, pDataArray->threadID, pDataArray->flip, pDataArray->writeShortcuts); }
else if(pDataArray->method == "knn"){ myclassify = new Knn(pDataArray->taxonomyFileName, pDataArray->templateFileName, pDataArray->search, pDataArray->kmerSize, pDataArray->gapOpen, pDataArray->gapExtend, pDataArray->match, pDataArray->misMatch, pDataArray->numWanted, pDataArray->threadID); }
+ else if(pDataArray->method == "zap"){
+ outputMethodTag = pDataArray->search + "_" + outputMethodTag;
+ if (pDataArray->search == "kmer") { myclassify = new KmerTree(pDataArray->templateFileName, pDataArray->taxonomyFileName, pDataArray->kmerSize, pDataArray->cutoff); }
+ else { myclassify = new AlignTree(pDataArray->templateFileName, pDataArray->taxonomyFileName, pDataArray->cutoff); }
+ }
else {
//Fallback for an unrecognized method: warn, then classify with Bayesian.
//The value that failed the checks above is pDataArray->method, so that is the
//one reported here (the message previously printed the search option instead).
pDataArray->m->mothurOut(pDataArray->method + " is not a valid method option. I will run the command using bayesian.");
pDataArray->m->mothurOutEndLine();
- myclassify = new Bayesian(pDataArray->taxonomyFileName, pDataArray->templateFileName, pDataArray->search, pDataArray->kmerSize, pDataArray->cutoff, pDataArray->iters, pDataArray->threadID, pDataArray->flip);
+ myclassify = new Bayesian(pDataArray->taxonomyFileName, pDataArray->templateFileName, pDataArray->search, pDataArray->kmerSize, pDataArray->cutoff, pDataArray->iters, pDataArray->threadID, pDataArray->flip, pDataArray->writeShortcuts);
}
if (pDataArray->m->control_pressed) { delete myclassify; return 0; }
- int count = 0;
+ pDataArray->count = 0;
for(int i = 0; i < pDataArray->end; i++){ //end is the number of sequences to process
if (pDataArray->m->control_pressed) { delete myclassify; return 0; }
if (myclassify->getFlipped()) { outAcc << candidateSeq->getName() << endl; }
- count++;
+ pDataArray->count++;
}
delete candidateSeq;
//report progress
- if((count) % 100 == 0){ pDataArray->m->mothurOut("Processing sequence: " + toString(count)); pDataArray->m->mothurOutEndLine(); }
+ if((pDataArray->count) % 100 == 0){ pDataArray->m->mothurOutJustToScreen("Processing sequence: " + toString(pDataArray->count)+"\n"); }
}
//report progress
- if((count) % 100 != 0){ pDataArray->m->mothurOut("Processing sequence: " + toString(count)); pDataArray->m->mothurOutEndLine(); }
+ if((pDataArray->count) % 100 != 0){ pDataArray->m->mothurOutJustToScreen("Processing sequence: " + toString(pDataArray->count)+"\n"); }
delete myclassify;
inFASTA.close();