5 * Created by Pat Schloss on 12/22/08.
6 * Copyright 2008 Patrick D. Schloss. All rights reserved.
11 #include "database.hpp"
12 #include "sequence.hpp"
13 #include "blastdb.hpp"
15 /**************************************************************************************************/
17 BlastDB::BlastDB(string tag, float gO, float gE, float mm, float mM) : Database(),
18 gapOpen(gO), gapExtend(gE), match(mm), misMatch(mM) {
22 int randNumber = rand();
23 //int randNumber = 12345;
25 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
26 pid += toString(getpid());
29 dbFileName = tag + pid + toString(randNumber) + ".template.unaligned.fasta";
30 queryFileName = tag + pid + toString(randNumber) + ".candidate.unaligned.fasta";
31 blastFileName = tag + pid + toString(randNumber) + ".blast";
33 //make sure blast exists in the write place
35 string tempPath = path;
36 for (int i = 0; i < path.length(); i++) { tempPath[i] = tolower(path[i]); }
37 path = path.substr(0, (tempPath.find_last_of('m')));
39 string formatdbCommand;
40 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
41 formatdbCommand = path + "blast/bin/formatdb"; // format the database, -o option gives us the ability
43 formatdbCommand = path + "blast\\bin\\formatdb.exe";
44 //wrap entire string in ""
45 //formatdbCommand = "\"" + formatdbCommand + "\"";
48 //test to make sure formatdb exists
50 formatdbCommand = m->getFullPathName(formatdbCommand);
51 int ableToOpen = m->openInputFile(formatdbCommand, in, "no error"); in.close();
52 if(ableToOpen == 1) { m->mothurOut("[ERROR]: " + formatdbCommand + " file does not exist. mothur requires formatdb.exe to be in the ./blast/bin folder relative to the mothur.exe location."); m->mothurOutEndLine(); m->control_pressed = true; }
55 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
56 blastCommand = path + "blast/bin/blastall"; // format the database, -o option gives us the ability
58 blastCommand = path + "blast\\bin\\blastall.exe";
59 //wrap entire string in ""
60 //blastCommand = "\"" + blastCommand + "\"";
63 //test to make sure formatdb exists
65 blastCommand = m->getFullPathName(blastCommand);
66 ableToOpen = m->openInputFile(blastCommand, in2, "no error"); in2.close();
67 if(ableToOpen == 1) { m->mothurOut("[ERROR]: " + blastCommand + " file does not exist. mothur requires blastall.exe to be in the ./blast/bin folder relative to the mothur.exe location."); m->mothurOutEndLine(); m->control_pressed = true; }
70 string megablastCommand;
71 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
72 megablastCommand = path + "blast/bin/megablast"; // format the database, -o option gives us the ability
74 megablastCommand = path + "blast\\bin\\megablast.exe";
75 //wrap entire string in ""
76 //megablastCommand = "\"" + megablastCommand + "\"";
79 //test to make sure formatdb exists
81 megablastCommand = m->getFullPathName(megablastCommand);
82 ableToOpen = m->openInputFile(megablastCommand, in3, "no error"); in3.close();
83 if(ableToOpen == 1) { m->mothurOut("[ERROR]: " + megablastCommand + " file does not exist. mothur requires megablast.exe to be in the ./blast/bin folder relative to the mothur.exe location."); m->mothurOutEndLine(); m->control_pressed = true; }
87 m->errorOut(e, "BlastDB", "BlastDB");
91 /**************************************************************************************************/
93 BlastDB::BlastDB() : Database() {
97 int randNumber = rand();
99 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
100 pid += toString(getpid());
103 dbFileName = pid + toString(randNumber) + ".template.unaligned.fasta";
104 queryFileName = pid + toString(randNumber) + ".candidate.unaligned.fasta";
105 blastFileName = pid + toString(randNumber) + ".blast";
107 //make sure blast exists in the write place
109 string tempPath = path;
110 for (int i = 0; i < path.length(); i++) { tempPath[i] = tolower(path[i]); }
111 path = path.substr(0, (tempPath.find_last_of('m')));
113 string formatdbCommand;
114 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
115 formatdbCommand = path + "blast/bin/formatdb"; // format the database, -o option gives us the ability
117 formatdbCommand = path + "blast\\bin\\formatdb.exe";
118 //wrap entire string in ""
119 //formatdbCommand = "\"" + formatdbCommand + "\"";
122 //test to make sure formatdb exists
124 formatdbCommand = m->getFullPathName(formatdbCommand);
125 int ableToOpen = m->openInputFile(formatdbCommand, in, "no error"); in.close();
126 if(ableToOpen == 1) { m->mothurOut("[ERROR]: " + formatdbCommand + " file does not exist. mothur requires formatdb.exe to be in the ./blast/bin folder relative to the mothur.exe location."); m->mothurOutEndLine(); m->control_pressed = true; }
129 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
130 blastCommand = path + "blast/bin/blastall"; // format the database, -o option gives us the ability
132 blastCommand = path + "blast\\bin\\blastall.exe";
133 //wrap entire string in ""
134 //blastCommand = "\"" + blastCommand + "\"";
137 //test to make sure formatdb exists
139 blastCommand = m->getFullPathName(blastCommand);
140 ableToOpen = m->openInputFile(blastCommand, in2, "no error"); in2.close();
141 if(ableToOpen == 1) { m->mothurOut("[ERROR]: " + blastCommand + " file does not exist. mothur requires blastall.exe to be in the ./blast/bin folder relative to the mothur.exe location."); m->mothurOutEndLine(); m->control_pressed = true; }
144 string megablastCommand;
145 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
146 megablastCommand = path + "blast/bin/megablast"; // format the database, -o option gives us the ability
148 megablastCommand = path + "blast\\bin\\megablast.exe";
149 //wrap entire string in ""
150 //megablastCommand = "\"" + megablastCommand + "\"";
153 //test to make sure formatdb exists
155 megablastCommand = m->getFullPathName(megablastCommand);
156 ableToOpen = m->openInputFile(megablastCommand, in3, "no error"); in3.close();
157 if(ableToOpen == 1) { m->mothurOut("[ERROR]: " + megablastCommand + " file does not exist. mothur requires megablast.exe to be in the ./blast/bin folder relative to the mothur.exe location."); m->mothurOutEndLine(); m->control_pressed = true; }
161 catch(exception& e) {
162 m->errorOut(e, "BlastDB", "BlastDB");
167 /**************************************************************************************************/
171 remove(queryFileName.c_str()); // let's clean stuff up and remove the temp files
172 remove(dbFileName.c_str()); // let's clean stuff up and remove the temp files
173 remove((dbFileName+".nsq").c_str()); // let's clean stuff up and remove the temp files
174 remove((dbFileName+".nsi").c_str()); // let's clean stuff up and remove the temp files
175 remove((dbFileName+".nsd").c_str()); // let's clean stuff up and remove the temp files
176 remove((dbFileName+".nin").c_str()); // let's clean stuff up and remove the temp files
177 remove((dbFileName+".nhr").c_str()); // let's clean stuff up and remove the temp files
178 remove(blastFileName.c_str()); // let's clean stuff up and remove the temp files
180 catch(exception& e) {
181 m->errorOut(e, "BlastDB", "~BlastDB");
185 /**************************************************************************************************/
186 //assumes you have added all the template sequences using the addSequence function and run generateDB.
187 vector<int> BlastDB::findClosestSequences(Sequence* seq, int n) {
189 vector<int> topMatches;
192 int randNumber = rand();
194 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
195 pid += toString(getpid());
198 m->openOutputFile((queryFileName+pid+toString(randNumber)), queryFile);
199 queryFile << '>' << seq->getName() << endl;
200 queryFile << seq->getUnaligned() << endl;
204 // the goal here is to quickly survey the database to find the closest match. To do this we are using the default
205 // wordsize used in megablast. I'm sure we're sacrificing accuracy for speed, but anyother way would take way too
206 // long. With this setting, it seems comparable in speed to the suffix tree approach.
209 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
211 blastCommand = path + "blast/bin/blastall -p blastn -d " + dbFileName + " -m 8 -W 28 -v " + toString(n) + " -b " + toString(n);
212 blastCommand += (" -i " + (queryFileName+pid+toString(randNumber)) + " -o " + blastFileName+pid+toString(randNumber));
214 blastCommand = "\"" + path + "blast\\bin\\blastall\" -p blastn -d " + "\"" + dbFileName + "\"" + " -m 8 -W 28 -v " + toString(n) + " -b " + toString(n);
215 blastCommand += (" -i " + (queryFileName+pid+toString(randNumber)) + " -o " + blastFileName+pid+toString(randNumber));
216 //wrap entire string in ""
217 blastCommand = "\"" + blastCommand + "\"";
220 system(blastCommand.c_str());
222 ifstream m8FileHandle;
223 m->openInputFile(blastFileName+pid+toString(randNumber), m8FileHandle, "no error");
226 int templateAccession;
227 m->gobble(m8FileHandle);
229 while(!m8FileHandle.eof()){
230 m8FileHandle >> dummy >> templateAccession >> searchScore;
232 //get rest of junk in line
233 while (!m8FileHandle.eof()) { char c = m8FileHandle.get(); if (c == 10 || c == 13){ break; } }
235 m->gobble(m8FileHandle);
236 topMatches.push_back(templateAccession);
238 m8FileHandle.close();
239 remove((queryFileName+pid+toString(randNumber)).c_str());
240 remove((blastFileName+pid+toString(randNumber)).c_str());
244 catch(exception& e) {
245 m->errorOut(e, "BlastDB", "findClosestSequences");
250 /**************************************************************************************************/
251 //assumes you have added all the template sequences using the addSequence function and run generateDB.
252 vector<int> BlastDB::findClosestMegaBlast(Sequence* seq, int n, int minPerID) {
254 vector<int> topMatches;
255 float numBases, mismatch, gap, startQuery, endQuery, startRef, endRef, score;
259 int randNumber = rand();
261 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
262 pid += toString(getpid());
265 m->openOutputFile((queryFileName+pid+toString(randNumber)), queryFile);
266 queryFile << '>' << seq->getName() << endl;
267 queryFile << seq->getUnaligned() << endl;
269 // cout << seq->getUnaligned() << endl;
270 // the goal here is to quickly survey the database to find the closest match. To do this we are using the default
271 // wordsize used in megablast. I'm sure we're sacrificing accuracy for speed, but anyother way would take way too
272 // long. With this setting, it seems comparable in speed to the suffix tree approach.
273 //7000004128189528left 0 100 66 0 0 1 66 61 126 1e-31 131
275 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
276 blastCommand = path + "blast/bin/megablast -e 1e-10 -d " + dbFileName + " -m 8 -b " + toString(n) + " -v " + toString(n); //-W 28 -p blastn
277 blastCommand += (" -i " + (queryFileName+pid+toString(randNumber)) + " -o " + blastFileName+pid+toString(randNumber));
279 //blastCommand = path + "blast\\bin\\megablast -e 1e-10 -d " + dbFileName + " -m 8 -b " + toString(n) + " -v " + toString(n); //-W 28 -p blastn
280 //blastCommand += (" -i " + (queryFileName+toString(randNumber)) + " -o " + blastFileName+toString(randNumber));
282 blastCommand = "\"" + path + "blast\\bin\\megablast\" -e 1e-10 -d " + "\"" + dbFileName + "\"" + " -m 8 -b " + toString(n) + " -v " + toString(n); //-W 28 -p blastn
283 blastCommand += (" -i " + (queryFileName+pid+toString(randNumber)) + " -o " + blastFileName+pid+toString(randNumber));
284 //wrap entire string in ""
285 blastCommand = "\"" + blastCommand + "\"";
288 system(blastCommand.c_str());
290 ifstream m8FileHandle;
291 m->openInputFile(blastFileName+pid+toString(randNumber), m8FileHandle, "no error");
293 string dummy, eScore;
294 int templateAccession;
295 m->gobble(m8FileHandle);
297 while(!m8FileHandle.eof()){
298 m8FileHandle >> dummy >> templateAccession >> searchScore >> numBases >> mismatch >> gap >> startQuery >> endQuery >> startRef >> endRef >> eScore >> score;
299 // cout << dummy << '\t' << templateAccession << '\t' << searchScore << '\t' << numBases << '\t' << mismatch << '\t' << gap << '\t' << startQuery << '\t' << endQuery << '\t' << startRef << '\t' << endRef << '\t' << eScore << '\t' << score << endl;
301 //get rest of junk in line
302 //while (!m8FileHandle.eof()) { char c = m8FileHandle.get(); if (c == 10 || c == 13){ break; }else{ cout << c; } } //
304 m->gobble(m8FileHandle);
305 if (searchScore >= minPerID) {
306 topMatches.push_back(templateAccession);
307 Scores.push_back(searchScore);
309 //cout << templateAccession << endl;
311 m8FileHandle.close();
312 remove((queryFileName+pid+toString(randNumber)).c_str());
313 remove((blastFileName+pid+toString(randNumber)).c_str());
317 catch(exception& e) {
318 m->errorOut(e, "BlastDB", "findClosestMegaBlast");
322 /**************************************************************************************************/
323 void BlastDB::addSequence(Sequence seq) {
326 ofstream unalignedFastaFile;
327 m->openOutputFileAppend(dbFileName, unalignedFastaFile);
329 // generating a fasta file with unaligned template
330 unalignedFastaFile << '>' << count << endl; // sequences, which will be input to formatdb
331 unalignedFastaFile << seq.getUnaligned() << endl;
332 unalignedFastaFile.close();
336 catch(exception& e) {
337 m->errorOut(e, "BlastDB", "addSequence");
341 /**************************************************************************************************/
342 void BlastDB::generateDB() {
345 //m->mothurOut("Generating the temporary BLAST database...\t"); cout.flush();
348 string tempPath = path;
349 for (int i = 0; i < path.length(); i++) { tempPath[i] = tolower(path[i]); }
350 path = path.substr(0, (tempPath.find_last_of('m')));
352 string formatdbCommand;
354 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
355 formatdbCommand = path + "blast/bin/formatdb -p F -o T -i " + dbFileName; // format the database, -o option gives us the ability
357 //formatdbCommand = path + "blast\\bin\\formatdb -p F -o T -i " + dbFileName; // format the database, -o option gives us the ability
359 formatdbCommand = "\"" + path + "blast\\bin\\formatdb\" -p F -o T -i " + "\"" + dbFileName + "\"";
360 //wrap entire string in ""
361 formatdbCommand = "\"" + formatdbCommand + "\"";
363 //cout << formatdbCommand << endl;
364 system(formatdbCommand.c_str()); // to get the right sequence names, i think. -p F
365 // option tells formatdb that seqs are DNA, not prot
366 //m->mothurOut("DONE."); m->mothurOutEndLine(); m->mothurOutEndLine(); cout.flush();
368 catch(exception& e) {
369 m->errorOut(e, "BlastDB", "generateDB");
373 /**************************************************************************************************/
375 /**************************************************************************************************/