X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=blastdb.cpp;h=979d507c23d61b4537d62a1a4736721624616fb2;hb=d051154d778b77ba36dc603bbdb3c148a62a8e33;hp=ef8f737ee8bc5507b5c7a24dd71c3a1bd27d80ce;hpb=526a868606faa50caf86e7399f7554c0335b39e5;p=mothur.git

diff --git a/blastdb.cpp b/blastdb.cpp
index ef8f737..979d507 100644
--- a/blastdb.cpp
+++ b/blastdb.cpp
@@ -7,8 +7,6 @@
  *
  */
 
-using namespace std;
-
 
 #include "database.hpp"
 #include "sequence.hpp"
@@ -16,80 +14,190 @@ using namespace std;
 
 /**************************************************************************************************/
 
-BlastDB::BlastDB(string fastaFileName, float gO, float gE, float m, float mM) : Database(fastaFileName), 
+BlastDB::BlastDB(float gO, float gE, float m, float mM) : Database(), 
 gapOpen(gO), gapExtend(gE), match(m), misMatch(mM) {
-
-	cout << "Generating the temporary BLAST database...\t";	cout.flush();
+	// Stores the gap-open, gap-extend, match and mismatch values and picks the temp-file names; inside this body "m" is the float parameter, not the m used as m->... in the other methods.
+	globaldata = GlobalData::getInstance();	// generateDB later reads globaldata->argv
+	count = 0;	// header number for the next sequence appended by addSequence
 
 	int randNumber = rand();
 	dbFileName = toString(randNumber) + ".template.unaligned.fasta";
 	queryFileName = toString(randNumber) + ".candidate.unaligned.fasta";
 	blastFileName = toString(randNumber) + ".blast";
 
+}
+/**************************************************************************************************/
 
-	ofstream unalignedFastaFile;
-	openOutputFile(dbFileName, unalignedFastaFile);				
-	
-	for(int i=0;i<numSeqs;i++){									//	generating a fasta file with unaligned template
-		unalignedFastaFile << '>' << i << endl;					//	sequences, which will be input to formatdb
-		unalignedFastaFile << templateSequences[i]->getUnaligned() << endl;
-	}
-	unalignedFastaFile.close();
-	
-	string formatdbCommand = "~/Pipeline/src/cpp/production/blast/bin/formatdb -p F -o T -i " + dbFileName;	//	format the database, -o option gives us the ability
-	system(formatdbCommand.c_str());								//	to get the right sequence names, i think. -p F
-																	//	option tells formatdb that seqs are DNA, not prot
-	cout << "DONE." << endl << endl;	cout.flush();
-	emptySequence = new Sequence();
-	emptySequence->setName("no_match");
-	emptySequence->setUnaligned("XXXXXXXXXXXXXXXXXXXXXXXXXXXXX");
-	emptySequence->setAligned("XXXXXXXXXXXXXXXXXXXXXXXXXXXXX");
-
+BlastDB::BlastDB() : Database() {	// gapOpen, gapExtend, match and misMatch are not assigned by this overload
+	try {
+		globaldata = GlobalData::getInstance();	// generateDB later reads globaldata->argv
+		count = 0;	// header number for the next sequence appended by addSequence
 
+		int randNumber = rand();	// one value shared by all three names, so they carry a common prefix
+		dbFileName = toString(randNumber) + ".template.unaligned.fasta";	// template fasta file that addSequence appends to
+		queryFileName = toString(randNumber) + ".candidate.unaligned.fasta";	// prefix of the per-query fasta files (the sequence name is appended)
+		blastFileName = toString(randNumber) + ".blast";	// prefix of the per-query result files (the sequence name is appended)
+	}
+	catch(exception& e) {	// report through m->errorOut, then terminate the process
+		m->errorOut(e, "BlastDB", "BlastDB");
+		exit(1);
+	}
 }
 
 /**************************************************************************************************/
 
 BlastDB::~BlastDB(){
-	remove(queryFileName.c_str());				//	let's clean stuff up and remove the temp files
-	remove(dbFileName.c_str());					//	let's clean stuff up and remove the temp files
-	remove(blastFileName.c_str());				//	let's clean stuff up and remove the temp files
+	try{	// removes the temp files named in the constructor; the return values of remove() are ignored
+		remove(queryFileName.c_str());				//	bare query-file name (the search functions append the sequence name and remove those files themselves)
+		remove(dbFileName.c_str());					//	template fasta file written by addSequence
+		remove((dbFileName+".nsq").c_str());					//	.nsq/.nsi/.nsd/.nin/.nhr: presumably the files formatdb writes next to the fasta - TODO confirm
+		remove((dbFileName+".nsi").c_str());					//	(see the .nsq line)
+		remove((dbFileName+".nsd").c_str());					//	(see the .nsq line)
+		remove((dbFileName+".nin").c_str());					//	(see the .nsq line)
+		remove((dbFileName+".nhr").c_str());					//	(see the .nsq line)
+		remove(blastFileName.c_str());				//	bare result-file name (the search functions append the sequence name and remove those files themselves)
+	}
+	catch(exception& e) {	// report through m->errorOut, then terminate the process
+		m->errorOut(e, "BlastDB", "~BlastDB");
+		exit(1);
+	}
 }
-
 /**************************************************************************************************/
+//Runs blastall (blastn) with seq as the query and returns the template numbers of the reported hits (up to n are requested); searchScore keeps the third column of the last hit read. Assumes all template sequences were added with addSequence and generateDB has been run.
+vector<int> BlastDB::findClosestSequences(Sequence* seq, int n) {
+	try{
+		vector<int> topMatches;
+		// NOTE(review): the sequence name is pasted unquoted into the file names and the shell command below - confirm names cannot contain spaces, '/' or shell metacharacters.
+		ofstream queryFile;
+		m->openOutputFile((queryFileName+seq->getName()), queryFile);
+		queryFile << '>' << seq->getName() << endl;
+		queryFile << seq->getUnaligned() << endl;
+		queryFile.close();
+
+				
+		//	the goal here is to quickly survey the database to find the closest match.  To do this we are using the default
+		//	wordsize used in megablast (-W 28).  I'm sure we're sacrificing accuracy for speed, but any other way would take way too
+		//	long.  With this setting, it seems comparable in speed to the suffix tree approach.
+		
+		string blastCommand = path + "blast/bin/blastall -p blastn -d " + dbFileName + " -m 8 -W 28 -v " + toString(n) + " -b " + toString(n);
+		blastCommand += (" -i " + (queryFileName+seq->getName()) + " -o " + blastFileName+seq->getName());
+		system(blastCommand.c_str());
+		
+		ifstream m8FileHandle;
+		m->openInputFile(blastFileName+seq->getName(), m8FileHandle, "no error");
+		// each result line is read as: a string (ignored), the template number, and a value stored in searchScore; the rest of the line is skipped
+		string dummy;
+		int templateAccession = 0;
+		m->gobble(m8FileHandle);
+		
+		// Test the extraction itself rather than eof(): a malformed line sets failbit without eofbit, and the inner skip loop below would then never end.
+		while(m8FileHandle >> dummy >> templateAccession >> searchScore){
+			topMatches.push_back(templateAccession);
+			
+			//discard the remaining columns of this hit line (10 = LF, 13 = CR)
+			while (!m8FileHandle.eof())	{	int c = m8FileHandle.get(); if (c == 10 || c == 13){	break;	}	}
+			
+			m->gobble(m8FileHandle);
+		}
+		m8FileHandle.close();
+		remove((queryFileName+seq->getName()).c_str());
+		remove((blastFileName+seq->getName()).c_str());
 
-Sequence* BlastDB::findClosestSequence(Sequence* candidate){
+		return topMatches;
+	}
+	catch(exception& e) {
+		m->errorOut(e, "BlastDB", "findClosestSequences");
+		exit(1);
+	}
 
-	ofstream queryFile;
-	openOutputFile(queryFileName, queryFile);
-	queryFile << '>' << candidate->getName() << endl;
-	queryFile << candidate->getUnaligned() << endl;
-	queryFile.close();
-	
+}
+/**************************************************************************************************/
+//Runs megablast with seq as the query and returns the template numbers of the reported hits (up to n are requested); searchScore keeps the third column of the last hit read. Assumes all template sequences were added with addSequence and generateDB has been run.
+vector<int> BlastDB::findClosestMegaBlast(Sequence* seq, int n) {
+	try{
+		vector<int> topMatches;
+		// NOTE(review): the sequence name is pasted unquoted into the file names and the shell command below - confirm names cannot contain spaces, '/' or shell metacharacters.
+		ofstream queryFile;
+		m->openOutputFile((queryFileName+seq->getName()), queryFile);
+		queryFile << '>' << seq->getName() << endl;
+		queryFile << seq->getUnaligned() << endl;
+		queryFile.close();
+				
+		//	the goal here is to quickly survey the database to find the closest matches.  megablast is run with -e 1e-10 and,
+		//	unlike the blastall variant, without an explicit word size (-W 28 and -p blastn are left out of the command).
+		//	each result line is read as: a string (ignored), the template number, and a value stored in searchScore.
 	
-//	the goal here is to quickly survey the database to find the closest match.  To do this we are using the default
-//	wordsize used in megablast.  I'm sure we're sacrificing accuracy for speed, but anyother way would take way too
-//	long.  With this setting, it seems comparable in speed to the suffix tree approach.
+		string blastCommand = path + "blast/bin/megablast -e 1e-10 -d " + dbFileName + " -m 8 -b " + toString(n) + " -v " + toString(n); //-W 28 -p blastn
+		blastCommand += (" -i " + (queryFileName+seq->getName()) + " -o " + blastFileName+seq->getName());
+		system(blastCommand.c_str());
 
-	string blastCommand = "~/Pipeline/src/cpp/production/blast/bin/blastall -p blastn -d " + dbFileName + " -b 1 -m 8 -W 28";
-	blastCommand += (" -i " + queryFileName + " -o " + blastFileName);
-	system(blastCommand.c_str());
-	
-	ifstream m8FileHandle;
-	openInputFile(blastFileName, m8FileHandle);
+		ifstream m8FileHandle;
+		m->openInputFile(blastFileName+seq->getName(), m8FileHandle, "no error");
 	
-	string dummy;
-	int templateAccession;
-	gobble(m8FileHandle);
-	if(!m8FileHandle.eof()){
-		m8FileHandle >> dummy >> templateAccession >> searchScore;
+		string dummy;
+		int templateAccession = 0;
+		m->gobble(m8FileHandle);
+		
+		// Test the extraction itself rather than eof(): a malformed line sets failbit without eofbit,
+		// and the inner skip loop below would then never end.
+		while(m8FileHandle >> dummy >> templateAccession >> searchScore){
+			topMatches.push_back(templateAccession);
+			
+			//discard the remaining columns of this hit line (10 = LF, 13 = CR)
+			while (!m8FileHandle.eof())	{	int c = m8FileHandle.get(); if (c == 10 || c == 13){	break;	}	}
+			
+			m->gobble(m8FileHandle);
+		}
 		m8FileHandle.close();
-		return templateSequences[templateAccession];
+		remove((queryFileName+seq->getName()).c_str());
+		remove((blastFileName+seq->getName()).c_str());
+		
+		return topMatches;
+	}
+	catch(exception& e) {
+		m->errorOut(e, "BlastDB", "findClosestMegaBlast");
+		exit(1);
+	}
+}
+/**************************************************************************************************/
+void BlastDB::addSequence(Sequence seq) {
+	try {
+		// Appends one template sequence to the fasta file that generateDB later hands to formatdb.
+		ofstream unalignedFastaFile;
+		m->openOutputFileAppend(dbFileName, unalignedFastaFile);				// the file is reopened on every call; presumably in append mode, going by the helper's name
+	
+		//	the header is the running counter rather than the sequence name; the search functions read the hit column back as an int
+		unalignedFastaFile << '>' << count << endl;					//	header line: '>' followed by the current value of count
+		unalignedFastaFile << seq.getUnaligned() << endl;	// the sequence's unaligned text
+		unalignedFastaFile.close();
+	
+		count++;	// the next call writes the next number
 	}
-	else{
-		searchScore = 0.00;
-		return emptySequence;
+	catch(exception& e) {	// report through m->errorOut, then terminate the process
+		m->errorOut(e, "BlastDB", "addSequence");
+		exit(1);
 	}
 }
+/**************************************************************************************************/
+void BlastDB::generateDB() {
+	try {
+		// Runs formatdb through system() on the fasta file built by addSequence, and sets path, which the search functions use to locate the blast binaries.
+		//m->mothurOut("Generating the temporary BLAST database...\t");	cout.flush();
+		
+		path = globaldata->argv;	// NOTE(review): presumably the path this program was started with - confirm
+		path = path.substr(0, (path.find_last_of('m')));	// keeps everything before the last 'm' (if there is none, find_last_of returns npos and the whole string is kept); NOTE(review): presumably this strips the executable name - confirm
+	
+		string formatdbCommand = path + "blast/bin/formatdb -p F -o T -i " + dbFileName;	//	format the database, -o option gives us the ability
+		system(formatdbCommand.c_str());								//	to get the right sequence names, i think. -p F
+																	//	option tells formatdb that seqs are DNA, not prot
+		//m->mothurOut("DONE."); m->mothurOutEndLine();	m->mothurOutEndLine(); cout.flush();
+	}
+	catch(exception& e) {	// report through m->errorOut, then terminate the process; the exit status of formatdb itself is not checked
+		m->errorOut(e, "BlastDB", "generateDB");
+		exit(1);
+	}
+}
+/**************************************************************************************************/
 
 /**************************************************************************************************/
+