#include "classifyseqscommand.h"
#include "sequence.hpp"
-#include "phylotype.h"
#include "bayesian.h"
-#include "doTaxonomy.h"
+#include "phylotree.h"
#include "knn.h"
//**********************************************************************************************************************
ClassifySeqsCommand::ClassifySeqsCommand(string option){
try {
- // globaldata = GlobalData::getInstance();
abort = false;
//allow user to run help
else {
//valid paramters for this command
- string AlignArray[] = {"template","fasta","search","ksize","method","processors","taxonomy","match","mismatch","gapopen","gapextend","numwanted"};
+ string AlignArray[] = {"template","fasta","search","ksize","method","processors","taxonomy","match","mismatch","gapopen","gapextend","numwanted","cutoff"};
vector<string> myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string)));
OptionParser parser(option);
search = validParameter.validFile(parameters, "search", false); if (search == "not found"){ search = "kmer"; }
- method = validParameter.validFile(parameters, "method", false); if (method == "not found"){ method = "phylotype"; }
+ method = validParameter.validFile(parameters, "method", false); if (method == "not found"){ method = "bayesian"; }
temp = validParameter.validFile(parameters, "match", false); if (temp == "not found"){ temp = "1.0"; }
convert(temp, match);
temp = validParameter.validFile(parameters, "numwanted", false); if (temp == "not found"){ temp = "10"; }
convert(temp, numWanted);
+
+ temp = validParameter.validFile(parameters, "cutoff", false); if (temp == "not found"){ temp = "0"; }
+ convert(temp, cutoff);
+
+ if ((method == "bayesian") && (search != "kmer")) {
+ mothurOut("The bayesian method requires the kmer search." + search + "will be disregarded." ); mothurOutEndLine();
+ search = "kmer";
+ }
}
}
mothurOut("The classify.seqs command parameters are template, fasta, search, ksize, method, taxonomy, processors, match, mismatch, gapopen, gapextend and numwanted.\n");
mothurOut("The template, fasta and taxonomy parameters are required.\n");
mothurOut("The search parameter allows you to specify the method to find most similar template. Your options are: suffix, kmer and blast. The default is kmer.\n");
- mothurOut("The method parameter allows you to specify classification method to use. Your options are: phylotype, bayesian and knn. The default is phylotype.\n");
+ mothurOut("The method parameter allows you to specify classification method to use. Your options are: bayesian and knn. The default is bayesian.\n");
mothurOut("The ksize parameter allows you to specify the kmer size for finding most similar template to candidate. The default is 8.\n");
mothurOut("The processors parameter allows you to specify the number of processors to use. The default is 1.\n");
mothurOut("The match parameter allows you to specify the bonus for having the same base. The default is 1.0.\n");
mothurOut("The gapopen parameter allows you to specify the penalty for opening a gap in an alignment. The default is -1.0.\n");
mothurOut("The gapextend parameter allows you to specify the penalty for extending a gap in an alignment. The default is -2.0.\n");
mothurOut("The numwanted parameter allows you to specify the number of sequence matches you want with the knn method. The default is 10.\n");
+ mothurOut("The cutoff parameter allows you to specify a bootstrap confidence threshold for your taxonomy. The default is 0.\n");
mothurOut("The classify.seqs command should be in the following format: \n");
mothurOut("classify.seqs(template=yourTemplateFile, fasta=yourFastaFile, method=yourClassificationMethod, search=yourSearchmethod, ksize=yourKmerSize, taxonomy=yourTaxonomyFile, processors=yourProcessors) \n");
- mothurOut("Example classify.seqs(fasta=amazon.fasta, template=core.filtered, method=phylotype, search=gotoh, ksize=8, processors=2)\n");
+ mothurOut("Example classify.seqs(fasta=amazon.fasta, template=core.filtered, method=knn, search=gotoh, ksize=8, processors=2)\n");
mothurOut("The .taxonomy file consists of 2 columns: 1 = your sequence name, 2 = the taxonomy for your sequence. \n");
mothurOut("The .tax.summary is a summary of the different taxonomies represented in your fasta file. \n");
mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFastaFile).\n\n");
try {
if (abort == true) { return 0; }
- int start = time(NULL);
- if(method == "phylotype"){ classify = new PhyloType(taxonomyFileName, templateFileName, search, kmerSize, gapOpen, gapExtend, match, misMatch); }
- //else if(method == "bayesian") { classify = new Bayesian(taxonomyFileName, templateFileName, search); }
+
+ if(method == "bayesian") { classify = new Bayesian(taxonomyFileName, templateFileName, search, kmerSize, cutoff); }
else if(method == "knn") { classify = new Knn(taxonomyFileName, templateFileName, search, kmerSize, gapOpen, gapExtend, match, misMatch, numWanted); }
else {
- mothurOut(search + " is not a valid method option. I will run the command using phylotype.");
+ mothurOut(search + " is not a valid method option. I will run the command using bayesian.");
mothurOutEndLine();
- classify = new PhyloType(taxonomyFileName, templateFileName, search, kmerSize, gapOpen, gapExtend, match, misMatch);
+ classify = new Bayesian(taxonomyFileName, templateFileName, search, kmerSize, cutoff);
}
int numFastaSeqs = 0;
string newTaxonomyFile = getRootName(fastaFileName) + "taxonomy";
+ string tempTaxonomyFile = getRootName(fastaFileName) + "taxonomy.temp";
string taxSummary = getRootName(fastaFileName) + "tax.summary";
+ int start = time(NULL);
#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
if(processors == 1){
ifstream inFASTA;
inFASTA.close();
lines.push_back(new linePair(0, numFastaSeqs));
-
- driver(lines[0], newTaxonomyFile);
-
+
+ driver(lines[0], newTaxonomyFile, tempTaxonomyFile);
}
else{
vector<int> positions;
}
lines.push_back(new linePair(startPos, numSeqsPerProcessor));
}
- createProcesses(newTaxonomyFile);
+ createProcesses(newTaxonomyFile, tempTaxonomyFile);
rename((newTaxonomyFile + toString(processIDS[0]) + ".temp").c_str(), newTaxonomyFile.c_str());
+ rename((tempTaxonomyFile + toString(processIDS[0]) + ".temp").c_str(), tempTaxonomyFile.c_str());
for(int i=1;i<processors;i++){
appendTaxFiles((newTaxonomyFile + toString(processIDS[i]) + ".temp"), newTaxonomyFile);
+ appendTaxFiles((tempTaxonomyFile + toString(processIDS[i]) + ".temp"), tempTaxonomyFile);
remove((newTaxonomyFile + toString(processIDS[i]) + ".temp").c_str());
+ remove((tempTaxonomyFile + toString(processIDS[i]) + ".temp").c_str());
}
}
#else
ifstream inFASTA;
- openInputFile(candidateFileName, inFASTA);
+ openInputFile(fastaFileName, inFASTA);
numFastaSeqs=count(istreambuf_iterator<char>(inFASTA),istreambuf_iterator<char>(), '>');
inFASTA.close();
lines.push_back(new linePair(0, numFastaSeqs));
- driver(lines[0], newTaxonomyFile);
-#endif
-
+ driver(lines[0], newTaxonomyFile, tempTaxonomyFile);
+#endif
delete classify;
//make taxonomy tree from new taxonomy file
ifstream inTaxonomy;
- openInputFile(newTaxonomyFile, inTaxonomy);
+ openInputFile(tempTaxonomyFile, inTaxonomy);
string accession, taxaList;
PhyloTree taxaBrowser;
gobble(inTaxonomy);
}
inTaxonomy.close();
+ remove(tempTaxonomyFile.c_str());
taxaBrowser.assignHeirarchyIDs(0);
}
/**************************************************************************************************/
-void ClassifySeqsCommand::createProcesses(string taxFileName) {
+void ClassifySeqsCommand::createProcesses(string taxFileName, string tempTaxFile) {
try {
#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
int process = 0;
processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later
process++;
}else if (pid == 0){
- driver(lines[process], taxFileName + toString(getpid()) + ".temp");
+ driver(lines[process], taxFileName + toString(getpid()) + ".temp", tempTaxFile + toString(getpid()) + ".temp");
exit(0);
}else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); }
}
//**********************************************************************************************************************
-int ClassifySeqsCommand::driver(linePair* line, string taxFName){
+int ClassifySeqsCommand::driver(linePair* line, string taxFName, string tempTFName){
try {
ofstream outTax;
openOutputFile(taxFName, outTax);
+
+ ofstream outTaxSimple;
+ openOutputFile(tempTFName, outTaxSimple);
ifstream inFASTA;
openInputFile(fastaFileName, inFASTA);
if (taxonomy != "bad seq") {
outTax << candidateSeq->getName() << '\t' << taxonomy << endl;
+ outTaxSimple << candidateSeq->getName() << '\t' << classify->getSimpleTax() << endl;
}
delete candidateSeq;
+
+ if((i+1) % 100 == 0){
+ mothurOut("Classifying sequence " + toString(i+1)); mothurOutEndLine();
+ }
}
inFASTA.close();
outTax.close();
+ outTaxSimple.close();
return 1;
}