X-Git-Url: https://git.donarmstrong.com/?p=mothur.git;a=blobdiff_plain;f=classifyseqscommand.h;h=59d9ee275800195255961639d77fb6c57d97f417;hp=0e21a203caaa0c4a8166d7ecb08a685cd36f3169;hb=d1c97b8c04bb75faca1e76ffad60b37a4d789d3d;hpb=0ca63a8165baa0afa459e644ebe140ba496d5ba0

diff --git a/classifyseqscommand.h b/classifyseqscommand.h
index 0e21a20..59d9ee2 100644
--- a/classifyseqscommand.h
+++ b/classifyseqscommand.h
@@ -10,7 +10,7 @@
  *
  */
 
-#include "mothur.h"
+
 #include "command.hpp"
 #include "classify.h"
 #include "referencedb.h"
@@ -19,9 +19,11 @@
 #include "phylotree.h"
 #include "phylosummary.h"
 #include "knn.h"
+#include "kmertree.h"
+#include "aligntree.h"
 
 
-//KNN and Bayesian methods modeled from algorithms in
+//KNN and Wang methods modeled from algorithms in
 //NaÄ±Â¨ve Bayesian Classiï¬er for Rapid Assignment of rRNA Sequences 
 //into the New Bacterial Taxonomyô°â  
 //Qiong Wang,1 George M. Garrity,1,2 James M. Tiedje,1,2 and James R. Cole1* 
@@ -41,7 +43,9 @@ public:
 	vector<string> setParameters();
 	string getCommandName()			{ return "classify.seqs";		}
 	string getCommandCategory()		{ return "Phylotype Analysis";	}
+	
 	string getHelpString();	
+    string getOutputPattern(string);	
 	string getCitation() { return "Wang Q, Garrity GM, Tiedje JM, Cole JR (2007). Naive Bayesian classifier for rapid assignment of rRNA sequences into the new bacterial taxonomy. Appl Environ Microbiol 73: 5261-7. [ for Bayesian classifier ] \nAltschul SF, Madden TL, Schaffer AA, Zhang J, Zhang Z, Miller W, Lipman DJ (1997). Gapped BLAST and PSI-BLAST: a new generation of protein database search programs. Nucleic Acids Res 25: 3389-402. [ for BLAST ] \nDeSantis TZ, Hugenholtz P, Larsen N, Rojas M, Brodie EL, Keller K, Huber T, Dalevi D, Hu P, Andersen GL (2006). Greengenes, a chimera-checked 16S rRNA gene database and workbench compatible with ARB. Appl Environ Microbiol 72: 5069-72. [ for kmer ] \nhttp://www.mothur.org/wiki/Classify.seqs"; }
 	string getDescription()		{ return "classify sequences"; }
 	
@@ -61,6 +65,7 @@ private:
 	vector<linePair*> lines;
 	vector<string> fastaFileNames;
 	vector<string> namefileNames;
+    vector<string> countfileNames;
 	vector<string> groupfileNames;
 	vector<string> outputNames;
 	map<string, vector<string> > nameMap;
@@ -69,19 +74,18 @@ private:
 	Classify* classify;
 	ReferenceDB* rdb;
 	
-	string fastaFileName, templateFileName, distanceFileName, namefile, search, method, taxonomyFileName, outputDir, groupfile;
+	string fastaFileName, templateFileName, countfile, distanceFileName, namefile, search, method, taxonomyFileName, outputDir, groupfile;
 	int processors, kmerSize, numWanted, cutoff, iters;
 	float match, misMatch, gapOpen, gapExtend;
-	bool abort, probs, save;
+	bool abort, probs, save, flip, hasName, hasCount, writeShortcuts;
 	
-	int driver(linePair*, string, string, string);
-	void appendTaxFiles(string, string);
-	int createProcesses(string, string, string); 
+	int driver(linePair*, string, string, string, string);
+	int createProcesses(string, string, string, string); 
 	string addUnclassifieds(string, int);
 	
 	int MPIReadNamesFile(string);
 	#ifdef USE_MPI
-	int driverMPI(int, int, MPI_File&, MPI_File&, MPI_File&, vector<unsigned long long>&);
+	int driverMPI(int, int, MPI_File&, MPI_File&, MPI_File&, MPI_File&, vector<unsigned long long>&);
 	#endif
 };
 
@@ -93,16 +97,17 @@ struct classifyData {
 	string taxFName; 
 	string tempTFName; 
 	string filename;
-	string search, taxonomyFileName, templateFileName, method;
+	string search, taxonomyFileName, templateFileName, method, accnos;
 	unsigned long long start;
 	unsigned long long end;
 	MothurOut* m;
 	float match, misMatch, gapOpen, gapExtend;
 	int count, kmerSize, threadID, cutoff, iters, numWanted;
-	bool probs;
+	bool probs, flip, writeShortcuts;
 	 
 	classifyData(){}
-	classifyData(bool p, string me, string te, string tx, string a, string r, string f, string se, int ks, int i, int numW, MothurOut* mout, unsigned long long st, unsigned long long en, float ma, float misMa, float gapO, float gapE, int cut, int tid) {
+	classifyData(string acc, bool p, string me, string te, string tx, string a, string r, string f, string se, int ks, int i, int numW, MothurOut* mout, unsigned long long st, unsigned long long en, float ma, float misMa, float gapO, float gapE, int cut, int tid, bool fli, bool wsh) {
+		accnos = acc;
 		taxonomyFileName = tx;
 		templateFileName = te;
 		taxFName = a;
@@ -124,11 +129,13 @@ struct classifyData {
 		threadID = tid;
 		probs = p;
 		count = 0;
+		flip = fli;
+        writeShortcuts = wsh;
 	}
 };
 
 /**************************************************************************************************/
-#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
 #else
 static DWORD WINAPI MyClassThreadFunction(LPVOID lpParam){ 
 	classifyData* pDataArray;
@@ -141,6 +148,9 @@ static DWORD WINAPI MyClassThreadFunction(LPVOID lpParam){
 		ofstream outTaxSimple;
 		pDataArray->m->openOutputFile(pDataArray->tempTFName, outTaxSimple);
 		
+		ofstream outAcc;
+		pDataArray->m->openOutputFile(pDataArray->accnos, outAcc);
+		
 		ifstream inFASTA;
 		pDataArray->m->openInputFile(pDataArray->filename, inFASTA);
 		
@@ -153,21 +163,25 @@ static DWORD WINAPI MyClassThreadFunction(LPVOID lpParam){
 			inFASTA.seekg(pDataArray->start-1); pDataArray->m->gobble(inFASTA); 
 		}
 		
-		pDataArray->count = pDataArray->end;
-		
 		//make classify
 		Classify* myclassify;
-		if(pDataArray->method == "bayesian"){	myclassify = new Bayesian(pDataArray->taxonomyFileName, pDataArray->templateFileName, pDataArray->search, pDataArray->kmerSize, pDataArray->cutoff, pDataArray->iters, pDataArray->threadID);		}
+        string outputMethodTag = pDataArray->method + ".";
+		if(pDataArray->method == "bayesian"){	myclassify = new Bayesian(pDataArray->taxonomyFileName, pDataArray->templateFileName, pDataArray->search, pDataArray->kmerSize, pDataArray->cutoff, pDataArray->iters, pDataArray->threadID, pDataArray->flip, pDataArray->writeShortcuts);		}
 		else if(pDataArray->method == "knn"){	myclassify = new Knn(pDataArray->taxonomyFileName, pDataArray->templateFileName, pDataArray->search, pDataArray->kmerSize, pDataArray->gapOpen, pDataArray->gapExtend, pDataArray->match, pDataArray->misMatch, pDataArray->numWanted, pDataArray->threadID);				}
+        else if(pDataArray->method == "zap"){	
+            outputMethodTag = pDataArray->search + "_" + outputMethodTag;
+            if (pDataArray->search == "kmer") {   myclassify = new KmerTree(pDataArray->templateFileName, pDataArray->taxonomyFileName, pDataArray->kmerSize, pDataArray->cutoff); }
+            else {  myclassify = new AlignTree(pDataArray->templateFileName, pDataArray->taxonomyFileName, pDataArray->cutoff);  }
+        }
 		else {
 			pDataArray->m->mothurOut(pDataArray->search + " is not a valid method option. I will run the command using bayesian.");
 			pDataArray->m->mothurOutEndLine();
-			myclassify = new Bayesian(pDataArray->taxonomyFileName, pDataArray->templateFileName, pDataArray->search, pDataArray->kmerSize, pDataArray->cutoff, pDataArray->iters, pDataArray->threadID);	
+			myclassify = new Bayesian(pDataArray->taxonomyFileName, pDataArray->templateFileName, pDataArray->search, pDataArray->kmerSize, pDataArray->cutoff, pDataArray->iters, pDataArray->threadID, pDataArray->flip, pDataArray->writeShortcuts);	
 		}
 		
 		if (pDataArray->m->control_pressed) { delete myclassify; return 0; }
 		
-		int count = 0;
+		pDataArray->count = 0;
 		for(int i = 0; i < pDataArray->end; i++){ //end is the number of sequences to process
 			
 			if (pDataArray->m->control_pressed) { delete myclassify; return 0; }
@@ -180,25 +194,28 @@ static DWORD WINAPI MyClassThreadFunction(LPVOID lpParam){
 				
 				if (pDataArray->m->control_pressed) { delete candidateSeq; return 0; }
 				
-				if (taxonomy != "bad seq") {
-					//output confidence scores or not
-					if (pDataArray->probs) {
-						outTax << candidateSeq->getName() << '\t' << taxonomy << endl;
-					}else{
-						outTax << candidateSeq->getName() << '\t' << myclassify->getSimpleTax() << endl;
-					}
-					
-					outTaxSimple << candidateSeq->getName() << '\t' << myclassify->getSimpleTax() << endl;
+				if (taxonomy == "unknown;") { pDataArray->m->mothurOut("[WARNING]: " + candidateSeq->getName() + " could not be classified. You can use the remove.lineage command with taxon=unknown; to remove such sequences."); pDataArray->m->mothurOutEndLine(); }
+
+				//output confidence scores or not
+				if (pDataArray->probs) {
+					outTax << candidateSeq->getName() << '\t' << taxonomy << endl;
+				}else{
+					outTax << candidateSeq->getName() << '\t' << myclassify->getSimpleTax() << endl;
 				}
-				count++;
+					
+				outTaxSimple << candidateSeq->getName() << '\t' << myclassify->getSimpleTax() << endl;
+					
+				if (myclassify->getFlipped()) { outAcc << candidateSeq->getName() << endl; }
+				
+				pDataArray->count++;
 			}
 			delete candidateSeq;
 			//report progress
-			if((count) % 100 == 0){	pDataArray->m->mothurOut("Processing sequence: " + toString(count)); pDataArray->m->mothurOutEndLine();		}
+			if((pDataArray->count) % 100 == 0){	pDataArray->m->mothurOutJustToScreen("Processing sequence: " + toString(pDataArray->count)+"\n"); 	}
 			
 		}
 		//report progress
-		if((count) % 100 != 0){	pDataArray->m->mothurOut("Processing sequence: " + toString(count)); pDataArray->m->mothurOutEndLine();		}
+		if((pDataArray->count) % 100 != 0){	pDataArray->m->mothurOutJustToScreen("Processing sequence: " + toString(pDataArray->count)+"\n"); 		}
 		
 		delete myclassify;
 		inFASTA.close();