5 * Created by Pat Schloss on 12/22/08.
6 * Copyright 2008 Patrick D. Schloss. All rights reserved.
11 #include "database.hpp"
12 #include "sequence.hpp"
13 #include "blastdb.hpp"
15 /**************************************************************************************************/
17 BlastDB::BlastDB(string tag, float gO, float gE, float mm, float mM, string b, int tid) : Database(),
18 gapOpen(gO), gapExtend(gE), match(mm), misMatch(mM) {
24 int randNumber = rand();
25 //int randNumber = 12345;
26 string pid = m->mothurGetpid(threadID);
28 if (m->debug) { m->mothurOut("[DEBUG]: tag = " + tag + "\t pid = " + pid + "\n"); }
30 dbFileName = tag + pid + toString(randNumber) + ".template.unaligned.fasta";
31 queryFileName = tag + pid + toString(randNumber) + ".candidate.unaligned.fasta";
32 blastFileName = tag + pid + toString(randNumber) + ".blast";
34 //make sure blast exists in the write place
37 string tempPath = path;
38 for (int i = 0; i < path.length(); i++) { tempPath[i] = tolower(path[i]); }
39 path = path.substr(0, (tempPath.find_last_of('m')));
41 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
44 path += "blast\\bin\\";
49 string formatdbCommand;
50 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
51 formatdbCommand = path + "formatdb"; // format the database, -o option gives us the ability
53 formatdbCommand = path + "formatdb.exe";
56 //test to make sure formatdb exists
58 formatdbCommand = m->getFullPathName(formatdbCommand);
59 int ableToOpen = m->openInputFile(formatdbCommand, in, "no error"); in.close();
60 if(ableToOpen == 1) { m->mothurOut("[ERROR]: " + formatdbCommand + " file does not exist. mothur requires formatdb.exe."); m->mothurOutEndLine(); m->control_pressed = true; }
63 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
64 blastCommand = path + "blastall"; // format the database, -o option gives us the ability
66 blastCommand = path + "blastall.exe";
67 //wrap entire string in ""
68 //blastCommand = "\"" + blastCommand + "\"";
71 //test to make sure formatdb exists
73 blastCommand = m->getFullPathName(blastCommand);
74 ableToOpen = m->openInputFile(blastCommand, in2, "no error"); in2.close();
75 if(ableToOpen == 1) { m->mothurOut("[ERROR]: " + blastCommand + " file does not exist. mothur requires blastall.exe."); m->mothurOutEndLine(); m->control_pressed = true; }
78 string megablastCommand;
79 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
80 megablastCommand = path + "megablast"; // format the database, -o option gives us the ability
82 megablastCommand = path + "megablast.exe";
85 //test to make sure formatdb exists
87 megablastCommand = m->getFullPathName(megablastCommand);
88 ableToOpen = m->openInputFile(megablastCommand, in3, "no error"); in3.close();
89 if(ableToOpen == 1) { m->mothurOut("[ERROR]: " + megablastCommand + " file does not exist. mothur requires megablast.exe."); m->mothurOutEndLine(); m->control_pressed = true; }
93 m->errorOut(e, "BlastDB", "BlastDB");
97 /**************************************************************************************************/
99 BlastDB::BlastDB(string b, int tid) : Database() {
106 //make sure blast exists in the write place
109 string tempPath = path;
110 for (int i = 0; i < path.length(); i++) { tempPath[i] = tolower(path[i]); }
111 path = path.substr(0, (tempPath.find_last_of('m')));
113 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
114 path += "blast/bin/";
116 path += "blast\\bin\\";
120 int randNumber = rand();
121 string pid = m->mothurGetpid(threadID);
122 dbFileName = pid + toString(randNumber) + ".template.unaligned.fasta";
123 queryFileName = pid + toString(randNumber) + ".candidate.unaligned.fasta";
124 blastFileName = pid + toString(randNumber) + ".blast";
126 string formatdbCommand;
127 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
128 formatdbCommand = path + "formatdb"; // format the database, -o option gives us the ability
130 formatdbCommand = path + "formatdb.exe";
131 //wrap entire string in ""
132 //formatdbCommand = "\"" + formatdbCommand + "\"";
135 //test to make sure formatdb exists
137 formatdbCommand = m->getFullPathName(formatdbCommand);
138 int ableToOpen = m->openInputFile(formatdbCommand, in, "no error"); in.close();
139 if(ableToOpen == 1) { m->mothurOut("[ERROR]: " + formatdbCommand + " file does not exist. mothur requires formatdb.exe."); m->mothurOutEndLine(); m->control_pressed = true; }
142 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
143 blastCommand = path + "blastall"; // format the database, -o option gives us the ability
145 blastCommand = path + "blastall.exe";
146 //wrap entire string in ""
147 //blastCommand = "\"" + blastCommand + "\"";
150 //test to make sure formatdb exists
152 blastCommand = m->getFullPathName(blastCommand);
153 ableToOpen = m->openInputFile(blastCommand, in2, "no error"); in2.close();
154 if(ableToOpen == 1) { m->mothurOut("[ERROR]: " + blastCommand + " file does not exist. mothur requires blastall.exe."); m->mothurOutEndLine(); m->control_pressed = true; }
157 string megablastCommand;
158 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
159 megablastCommand = path + "megablast"; // format the database, -o option gives us the ability
161 megablastCommand = path + "megablast.exe";
162 //wrap entire string in ""
163 //megablastCommand = "\"" + megablastCommand + "\"";
166 //test to make sure formatdb exists
168 megablastCommand = m->getFullPathName(megablastCommand);
169 ableToOpen = m->openInputFile(megablastCommand, in3, "no error"); in3.close();
170 if(ableToOpen == 1) { m->mothurOut("[ERROR]: " + megablastCommand + " file does not exist. mothur requires megablast.exe."); m->mothurOutEndLine(); m->control_pressed = true; }
174 catch(exception& e) {
175 m->errorOut(e, "BlastDB", "BlastDB");
180 /**************************************************************************************************/
184 m->mothurRemove(queryFileName); // let's clean stuff up and remove the temp files
185 m->mothurRemove(dbFileName); // let's clean stuff up and remove the temp files
186 m->mothurRemove((dbFileName+".nsq")); // let's clean stuff up and remove the temp files
187 m->mothurRemove((dbFileName+".nsi")); // let's clean stuff up and remove the temp files
188 m->mothurRemove((dbFileName+".nsd")); // let's clean stuff up and remove the temp files
189 m->mothurRemove((dbFileName+".nin")); // let's clean stuff up and remove the temp files
190 m->mothurRemove((dbFileName+".nhr")); // let's clean stuff up and remove the temp files
191 m->mothurRemove(blastFileName.c_str()); // let's clean stuff up and remove the temp files
193 catch(exception& e) {
194 m->errorOut(e, "BlastDB", "~BlastDB");
198 /**************************************************************************************************/
199 //assumes you have added all the template sequences using the addSequence function and run generateDB.
200 vector<int> BlastDB::findClosestSequences(Sequence* seq, int n) {
202 vector<int> topMatches;
205 int randNumber = rand();
206 string pid = scrubName(seq->getName());
208 m->openOutputFile((queryFileName+pid+toString(randNumber)), queryFile);
209 queryFile << '>' << seq->getName() << endl;
210 queryFile << seq->getUnaligned() << endl;
214 // the goal here is to quickly survey the database to find the closest match. To do this we are using the default
215 // wordsize used in megablast. I'm sure we're sacrificing accuracy for speed, but anyother way would take way too
216 // long. With this setting, it seems comparable in speed to the suffix tree approach.
219 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
221 blastCommand = path + "blastall -p blastn -d " + dbFileName + " -m 8 -W 28 -v " + toString(n) + " -b " + toString(n);
222 blastCommand += (" -i " + (queryFileName+pid+toString(randNumber)) + " -o " + blastFileName+pid+toString(randNumber));
224 blastCommand = "\"" + path + "blastall\" -p blastn -d " + "\"" + dbFileName + "\"" + " -m 8 -W 28 -v " + toString(n) + " -b " + toString(n);
225 blastCommand += (" -i " + (queryFileName+pid+toString(randNumber)) + " -o " + blastFileName+pid+toString(randNumber));
226 //wrap entire string in ""
227 blastCommand = "\"" + blastCommand + "\"";
230 system(blastCommand.c_str());
232 ifstream m8FileHandle;
233 m->openInputFile(blastFileName+pid+toString(randNumber), m8FileHandle, "no error");
236 int templateAccession;
237 m->gobble(m8FileHandle);
239 while(!m8FileHandle.eof()){
240 m8FileHandle >> dummy >> templateAccession >> searchScore;
242 //get rest of junk in line
243 while (!m8FileHandle.eof()) { char c = m8FileHandle.get(); if (c == 10 || c == 13){ break; } }
245 m->gobble(m8FileHandle);
246 topMatches.push_back(templateAccession);
248 m8FileHandle.close();
249 m->mothurRemove((queryFileName+pid+toString(randNumber)));
250 m->mothurRemove((blastFileName+pid+toString(randNumber)));
254 catch(exception& e) {
255 m->errorOut(e, "BlastDB", "findClosestSequences");
260 /**************************************************************************************************/
261 //assumes you have added all the template sequences using the addSequence function and run generateDB.
262 vector<int> BlastDB::findClosestMegaBlast(Sequence* seq, int n, int minPerID) {
264 vector<int> topMatches;
265 float numBases, mismatch, gap, startQuery, endQuery, startRef, endRef, score;
269 int randNumber = rand();
270 string pid = scrubName(seq->getName());
272 m->openOutputFile((queryFileName+pid+toString(randNumber)), queryFile);
273 queryFile << '>' << seq->getName() << endl;
274 queryFile << seq->getUnaligned() << endl;
276 // cout << seq->getUnaligned() << endl;
277 // the goal here is to quickly survey the database to find the closest match. To do this we are using the default
278 // wordsize used in megablast. I'm sure we're sacrificing accuracy for speed, but anyother way would take way too
279 // long. With this setting, it seems comparable in speed to the suffix tree approach.
280 //7000004128189528left 0 100 66 0 0 1 66 61 126 1e-31 131
282 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
283 blastCommand = path + "megablast -e 1e-10 -d " + dbFileName + " -m 8 -b " + toString(n) + " -v " + toString(n); //-W 28 -p blastn
284 blastCommand += (" -i " + (queryFileName+pid+toString(randNumber)) + " -o " + blastFileName+pid+toString(randNumber));
286 //blastCommand = path + "blast\\bin\\megablast -e 1e-10 -d " + dbFileName + " -m 8 -b " + toString(n) + " -v " + toString(n); //-W 28 -p blastn
287 //blastCommand += (" -i " + (queryFileName+toString(randNumber)) + " -o " + blastFileName+toString(randNumber));
289 blastCommand = "\"" + path + "megablast\" -e 1e-10 -d " + "\"" + dbFileName + "\"" + " -m 8 -b " + toString(n) + " -v " + toString(n); //-W 28 -p blastn
290 blastCommand += (" -i " + (queryFileName+pid+toString(randNumber)) + " -o " + blastFileName+pid+toString(randNumber));
291 //wrap entire string in ""
292 blastCommand = "\"" + blastCommand + "\"";
295 system(blastCommand.c_str());
297 ifstream m8FileHandle;
298 m->openInputFile(blastFileName+pid+toString(randNumber), m8FileHandle, "no error");
300 string dummy, eScore;
301 int templateAccession;
302 m->gobble(m8FileHandle);
304 while(!m8FileHandle.eof()){
305 m8FileHandle >> dummy >> templateAccession >> searchScore >> numBases >> mismatch >> gap >> startQuery >> endQuery >> startRef >> endRef >> eScore >> score;
306 // cout << dummy << '\t' << templateAccession << '\t' << searchScore << '\t' << numBases << '\t' << mismatch << '\t' << gap << '\t' << startQuery << '\t' << endQuery << '\t' << startRef << '\t' << endRef << '\t' << eScore << '\t' << score << endl;
308 //get rest of junk in line
309 //while (!m8FileHandle.eof()) { char c = m8FileHandle.get(); if (c == 10 || c == 13){ break; }else{ cout << c; } } //
311 m->gobble(m8FileHandle);
312 if (searchScore >= minPerID) {
313 topMatches.push_back(templateAccession);
314 Scores.push_back(searchScore);
316 //cout << templateAccession << endl;
318 m8FileHandle.close();
319 m->mothurRemove((queryFileName+pid+toString(randNumber)));
320 m->mothurRemove((blastFileName+pid+toString(randNumber)));
324 catch(exception& e) {
325 m->errorOut(e, "BlastDB", "findClosestMegaBlast");
329 /**************************************************************************************************/
330 void BlastDB::addSequence(Sequence seq) {
333 ofstream unalignedFastaFile;
334 m->openOutputFileAppend(dbFileName, unalignedFastaFile);
336 // generating a fasta file with unaligned template
337 unalignedFastaFile << '>' << count << endl; // sequences, which will be input to formatdb
338 unalignedFastaFile << seq.getUnaligned() << endl;
339 unalignedFastaFile.close();
343 catch(exception& e) {
344 m->errorOut(e, "BlastDB", "addSequence");
348 /**************************************************************************************************/
349 void BlastDB::generateDB() {
352 //m->mothurOut("Generating the temporary BLAST database...\t"); cout.flush();
354 string formatdbCommand;
356 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
357 formatdbCommand = path + "formatdb -p F -o T -i " + dbFileName; // format the database, -o option gives us the ability
359 //formatdbCommand = path + "blast\\bin\\formatdb -p F -o T -i " + dbFileName; // format the database, -o option gives us the ability
361 formatdbCommand = "\"" + path + "formatdb\" -p F -o T -i " + "\"" + dbFileName + "\"";
362 //wrap entire string in ""
363 formatdbCommand = "\"" + formatdbCommand + "\"";
365 //cout << formatdbCommand << endl;
366 system(formatdbCommand.c_str()); // to get the right sequence names, i think. -p F
367 // option tells formatdb that seqs are DNA, not prot
368 //m->mothurOut("DONE."); m->mothurOutEndLine(); m->mothurOutEndLine(); cout.flush();
370 catch(exception& e) {
371 m->errorOut(e, "BlastDB", "generateDB");
375 /**************************************************************************************************/
376 string BlastDB::scrubName(string seqName) {
379 string cleanName = "";
381 for (int i = 0; i < seqName.length(); i++) {
382 if (isalnum(seqName[i])) { cleanName += seqName[i]; }
383 else { cleanName += "_"; }
388 catch(exception& e) {
389 m->errorOut(e, "BlastDB", "scrubName");
393 /**************************************************************************************************/
395 /**************************************************************************************************/