5 * Created by Pat Schloss on 12/22/08.
6 * Copyright 2008 Patrick D. Schloss. All rights reserved.
11 #include "database.hpp"
12 #include "sequence.hpp"
13 #include "blastdb.hpp"
15 /**************************************************************************************************/
17 BlastDB::BlastDB(string tag, float gO, float gE, float mm, float mM, string b) : Database(),
18 gapOpen(gO), gapExtend(gE), match(mm), misMatch(mM) {
23 int randNumber = rand();
24 //int randNumber = 12345;
26 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
27 pid += toString(getpid());
30 dbFileName = tag + pid + toString(randNumber) + ".template.unaligned.fasta";
31 queryFileName = tag + pid + toString(randNumber) + ".candidate.unaligned.fasta";
32 blastFileName = tag + pid + toString(randNumber) + ".blast";
34 //make sure blast exists in the write place
37 string tempPath = path;
38 for (int i = 0; i < path.length(); i++) { tempPath[i] = tolower(path[i]); }
39 path = path.substr(0, (tempPath.find_last_of('m')));
41 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
44 path += "blast\\bin\\";
49 string formatdbCommand;
50 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
51 formatdbCommand = path + "formatdb"; // format the database, -o option gives us the ability
53 formatdbCommand = path + "formatdb.exe";
56 //test to make sure formatdb exists
58 formatdbCommand = m->getFullPathName(formatdbCommand);
59 int ableToOpen = m->openInputFile(formatdbCommand, in, "no error"); in.close();
60 if(ableToOpen == 1) { m->mothurOut("[ERROR]: " + formatdbCommand + " file does not exist. mothur requires formatdb.exe."); m->mothurOutEndLine(); m->control_pressed = true; }
63 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
64 blastCommand = path + "blastall"; // format the database, -o option gives us the ability
66 blastCommand = path + "blastall.exe";
67 //wrap entire string in ""
68 //blastCommand = "\"" + blastCommand + "\"";
71 //test to make sure formatdb exists
73 blastCommand = m->getFullPathName(blastCommand);
74 ableToOpen = m->openInputFile(blastCommand, in2, "no error"); in2.close();
75 if(ableToOpen == 1) { m->mothurOut("[ERROR]: " + blastCommand + " file does not exist. mothur requires blastall.exe."); m->mothurOutEndLine(); m->control_pressed = true; }
78 string megablastCommand;
79 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
80 megablastCommand = path + "megablast"; // format the database, -o option gives us the ability
82 megablastCommand = path + "megablast.exe";
85 //test to make sure formatdb exists
87 megablastCommand = m->getFullPathName(megablastCommand);
88 ableToOpen = m->openInputFile(megablastCommand, in3, "no error"); in3.close();
89 if(ableToOpen == 1) { m->mothurOut("[ERROR]: " + megablastCommand + " file does not exist. mothur requires megablast.exe."); m->mothurOutEndLine(); m->control_pressed = true; }
93 m->errorOut(e, "BlastDB", "BlastDB");
97 /**************************************************************************************************/
99 BlastDB::BlastDB(string b) : Database() {
105 //make sure blast exists in the write place
108 string tempPath = path;
109 for (int i = 0; i < path.length(); i++) { tempPath[i] = tolower(path[i]); }
110 path = path.substr(0, (tempPath.find_last_of('m')));
112 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
113 path += "blast/bin/";
115 path += "blast\\bin\\";
119 int randNumber = rand();
121 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
122 pid += toString(getpid());
125 dbFileName = pid + toString(randNumber) + ".template.unaligned.fasta";
126 queryFileName = pid + toString(randNumber) + ".candidate.unaligned.fasta";
127 blastFileName = pid + toString(randNumber) + ".blast";
129 string formatdbCommand;
130 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
131 formatdbCommand = path + "formatdb"; // format the database, -o option gives us the ability
133 formatdbCommand = path + "formatdb.exe";
134 //wrap entire string in ""
135 //formatdbCommand = "\"" + formatdbCommand + "\"";
138 //test to make sure formatdb exists
140 formatdbCommand = m->getFullPathName(formatdbCommand);
141 int ableToOpen = m->openInputFile(formatdbCommand, in, "no error"); in.close();
142 if(ableToOpen == 1) { m->mothurOut("[ERROR]: " + formatdbCommand + " file does not exist. mothur requires formatdb.exe."); m->mothurOutEndLine(); m->control_pressed = true; }
145 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
146 blastCommand = path + "blastall"; // format the database, -o option gives us the ability
148 blastCommand = path + "blastall.exe";
149 //wrap entire string in ""
150 //blastCommand = "\"" + blastCommand + "\"";
153 //test to make sure formatdb exists
155 blastCommand = m->getFullPathName(blastCommand);
156 ableToOpen = m->openInputFile(blastCommand, in2, "no error"); in2.close();
157 if(ableToOpen == 1) { m->mothurOut("[ERROR]: " + blastCommand + " file does not exist. mothur requires blastall.exe."); m->mothurOutEndLine(); m->control_pressed = true; }
160 string megablastCommand;
161 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
162 megablastCommand = path + "megablast"; // format the database, -o option gives us the ability
164 megablastCommand = path + "megablast.exe";
165 //wrap entire string in ""
166 //megablastCommand = "\"" + megablastCommand + "\"";
169 //test to make sure formatdb exists
171 megablastCommand = m->getFullPathName(megablastCommand);
172 ableToOpen = m->openInputFile(megablastCommand, in3, "no error"); in3.close();
173 if(ableToOpen == 1) { m->mothurOut("[ERROR]: " + megablastCommand + " file does not exist. mothur requires megablast.exe."); m->mothurOutEndLine(); m->control_pressed = true; }
177 catch(exception& e) {
178 m->errorOut(e, "BlastDB", "BlastDB");
183 /**************************************************************************************************/
187 m->mothurRemove(queryFileName); // let's clean stuff up and remove the temp files
188 m->mothurRemove(dbFileName); // let's clean stuff up and remove the temp files
189 m->mothurRemove((dbFileName+".nsq")); // let's clean stuff up and remove the temp files
190 m->mothurRemove((dbFileName+".nsi")); // let's clean stuff up and remove the temp files
191 m->mothurRemove((dbFileName+".nsd")); // let's clean stuff up and remove the temp files
192 m->mothurRemove((dbFileName+".nin")); // let's clean stuff up and remove the temp files
193 m->mothurRemove((dbFileName+".nhr")); // let's clean stuff up and remove the temp files
194 m->mothurRemove(blastFileName.c_str()); // let's clean stuff up and remove the temp files
196 catch(exception& e) {
197 m->errorOut(e, "BlastDB", "~BlastDB");
201 /**************************************************************************************************/
202 //assumes you have added all the template sequences using the addSequence function and run generateDB.
203 vector<int> BlastDB::findClosestSequences(Sequence* seq, int n) {
205 vector<int> topMatches;
208 int randNumber = rand();
210 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
211 pid += toString(getpid());
214 m->openOutputFile((queryFileName+pid+toString(randNumber)), queryFile);
215 queryFile << '>' << seq->getName() << endl;
216 queryFile << seq->getUnaligned() << endl;
220 // the goal here is to quickly survey the database to find the closest match. To do this we are using the default
221 // wordsize used in megablast. I'm sure we're sacrificing accuracy for speed, but anyother way would take way too
222 // long. With this setting, it seems comparable in speed to the suffix tree approach.
225 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
227 blastCommand = path + "blastall -p blastn -d " + dbFileName + " -m 8 -W 28 -v " + toString(n) + " -b " + toString(n);
228 blastCommand += (" -i " + (queryFileName+pid+toString(randNumber)) + " -o " + blastFileName+pid+toString(randNumber));
230 blastCommand = "\"" + path + "blastall\" -p blastn -d " + "\"" + dbFileName + "\"" + " -m 8 -W 28 -v " + toString(n) + " -b " + toString(n);
231 blastCommand += (" -i " + (queryFileName+pid+toString(randNumber)) + " -o " + blastFileName+pid+toString(randNumber));
232 //wrap entire string in ""
233 blastCommand = "\"" + blastCommand + "\"";
236 system(blastCommand.c_str());
238 ifstream m8FileHandle;
239 m->openInputFile(blastFileName+pid+toString(randNumber), m8FileHandle, "no error");
242 int templateAccession;
243 m->gobble(m8FileHandle);
245 while(!m8FileHandle.eof()){
246 m8FileHandle >> dummy >> templateAccession >> searchScore;
248 //get rest of junk in line
249 while (!m8FileHandle.eof()) { char c = m8FileHandle.get(); if (c == 10 || c == 13){ break; } }
251 m->gobble(m8FileHandle);
252 topMatches.push_back(templateAccession);
254 m8FileHandle.close();
255 m->mothurRemove((queryFileName+pid+toString(randNumber)));
256 m->mothurRemove((blastFileName+pid+toString(randNumber)));
260 catch(exception& e) {
261 m->errorOut(e, "BlastDB", "findClosestSequences");
266 /**************************************************************************************************/
267 //assumes you have added all the template sequences using the addSequence function and run generateDB.
268 vector<int> BlastDB::findClosestMegaBlast(Sequence* seq, int n, int minPerID) {
270 vector<int> topMatches;
271 float numBases, mismatch, gap, startQuery, endQuery, startRef, endRef, score;
275 int randNumber = rand();
277 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
278 pid += toString(getpid());
281 m->openOutputFile((queryFileName+pid+toString(randNumber)), queryFile);
282 queryFile << '>' << seq->getName() << endl;
283 queryFile << seq->getUnaligned() << endl;
285 // cout << seq->getUnaligned() << endl;
286 // the goal here is to quickly survey the database to find the closest match. To do this we are using the default
287 // wordsize used in megablast. I'm sure we're sacrificing accuracy for speed, but anyother way would take way too
288 // long. With this setting, it seems comparable in speed to the suffix tree approach.
289 //7000004128189528left 0 100 66 0 0 1 66 61 126 1e-31 131
291 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
292 blastCommand = path + "megablast -e 1e-10 -d " + dbFileName + " -m 8 -b " + toString(n) + " -v " + toString(n); //-W 28 -p blastn
293 blastCommand += (" -i " + (queryFileName+pid+toString(randNumber)) + " -o " + blastFileName+pid+toString(randNumber));
295 //blastCommand = path + "blast\\bin\\megablast -e 1e-10 -d " + dbFileName + " -m 8 -b " + toString(n) + " -v " + toString(n); //-W 28 -p blastn
296 //blastCommand += (" -i " + (queryFileName+toString(randNumber)) + " -o " + blastFileName+toString(randNumber));
298 blastCommand = "\"" + path + "megablast\" -e 1e-10 -d " + "\"" + dbFileName + "\"" + " -m 8 -b " + toString(n) + " -v " + toString(n); //-W 28 -p blastn
299 blastCommand += (" -i " + (queryFileName+pid+toString(randNumber)) + " -o " + blastFileName+pid+toString(randNumber));
300 //wrap entire string in ""
301 blastCommand = "\"" + blastCommand + "\"";
304 system(blastCommand.c_str());
306 ifstream m8FileHandle;
307 m->openInputFile(blastFileName+pid+toString(randNumber), m8FileHandle, "no error");
309 string dummy, eScore;
310 int templateAccession;
311 m->gobble(m8FileHandle);
313 while(!m8FileHandle.eof()){
314 m8FileHandle >> dummy >> templateAccession >> searchScore >> numBases >> mismatch >> gap >> startQuery >> endQuery >> startRef >> endRef >> eScore >> score;
315 // cout << dummy << '\t' << templateAccession << '\t' << searchScore << '\t' << numBases << '\t' << mismatch << '\t' << gap << '\t' << startQuery << '\t' << endQuery << '\t' << startRef << '\t' << endRef << '\t' << eScore << '\t' << score << endl;
317 //get rest of junk in line
318 //while (!m8FileHandle.eof()) { char c = m8FileHandle.get(); if (c == 10 || c == 13){ break; }else{ cout << c; } } //
320 m->gobble(m8FileHandle);
321 if (searchScore >= minPerID) {
322 topMatches.push_back(templateAccession);
323 Scores.push_back(searchScore);
325 //cout << templateAccession << endl;
327 m8FileHandle.close();
328 m->mothurRemove((queryFileName+pid+toString(randNumber)));
329 m->mothurRemove((blastFileName+pid+toString(randNumber)));
333 catch(exception& e) {
334 m->errorOut(e, "BlastDB", "findClosestMegaBlast");
338 /**************************************************************************************************/
339 void BlastDB::addSequence(Sequence seq) {
342 ofstream unalignedFastaFile;
343 m->openOutputFileAppend(dbFileName, unalignedFastaFile);
345 // generating a fasta file with unaligned template
346 unalignedFastaFile << '>' << count << endl; // sequences, which will be input to formatdb
347 unalignedFastaFile << seq.getUnaligned() << endl;
348 unalignedFastaFile.close();
352 catch(exception& e) {
353 m->errorOut(e, "BlastDB", "addSequence");
357 /**************************************************************************************************/
358 void BlastDB::generateDB() {
361 //m->mothurOut("Generating the temporary BLAST database...\t"); cout.flush();
363 string formatdbCommand;
365 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
366 formatdbCommand = path + "formatdb -p F -o T -i " + dbFileName; // format the database, -o option gives us the ability
368 //formatdbCommand = path + "blast\\bin\\formatdb -p F -o T -i " + dbFileName; // format the database, -o option gives us the ability
370 formatdbCommand = "\"" + path + "formatdb\" -p F -o T -i " + "\"" + dbFileName + "\"";
371 //wrap entire string in ""
372 formatdbCommand = "\"" + formatdbCommand + "\"";
374 //cout << formatdbCommand << endl;
375 system(formatdbCommand.c_str()); // to get the right sequence names, i think. -p F
376 // option tells formatdb that seqs are DNA, not prot
377 //m->mothurOut("DONE."); m->mothurOutEndLine(); m->mothurOutEndLine(); cout.flush();
379 catch(exception& e) {
380 m->errorOut(e, "BlastDB", "generateDB");
384 /**************************************************************************************************/
386 /**************************************************************************************************/