]> git.donarmstrong.com Git - mothur.git/blob - blastdb.cpp
added modify names parameter to set.dir
[mothur.git] / blastdb.cpp
1 /*
2  *  blastdb.cpp
3  *  
4  *
5  *  Created by Pat Schloss on 12/22/08.
6  *  Copyright 2008 Patrick D. Schloss. All rights reserved.
7  *
8  */
9
10
11 #include "database.hpp"
12 #include "sequence.hpp"
13 #include "blastdb.hpp"
14
15 /**************************************************************************************************/
16
17 BlastDB::BlastDB(string tag, float gO, float gE, float mm, float mM, string b, int tid) : Database(), 
18 gapOpen(gO), gapExtend(gE), match(mm), misMatch(mM) {
19         try {
20                 count = 0;
21                 path = b;
22                 threadID = tid;
23
24                 int randNumber = rand();
25                 //int randNumber = 12345;
26                 string pid = "";
27 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
28                 pid += getpid();        
29 #else
30                 pid += toString(threadID);      
31 #endif
32                 
33                 
34                 dbFileName = tag + pid + toString(randNumber) + ".template.unaligned.fasta";
35                 queryFileName = tag + pid + toString(randNumber) + ".candidate.unaligned.fasta";
36                 blastFileName = tag + pid + toString(randNumber) + ".blast";
37                 
38                 //make sure blast exists in the write place
39                 if (path == "") {
40                         path = m->argv;
41                         string tempPath = path;
42                         for (int i = 0; i < path.length(); i++) { tempPath[i] = tolower(path[i]); }
43                         path = path.substr(0, (tempPath.find_last_of('m')));
44                         
45 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
46                         path += "blast/bin/";   
47 #else
48                         path += "blast\\bin\\";
49 #endif                  
50                 }
51                 
52                 
53                 string formatdbCommand;
54 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
55                 formatdbCommand = path + "formatdb";    //      format the database, -o option gives us the ability
56 #else
57                 formatdbCommand = path + "formatdb.exe";
58 #endif
59                 
60                 //test to make sure formatdb exists
61                 ifstream in;
62                 formatdbCommand = m->getFullPathName(formatdbCommand);
63                 int ableToOpen = m->openInputFile(formatdbCommand, in, "no error"); in.close();
64                 if(ableToOpen == 1) {   m->mothurOut("[ERROR]: " + formatdbCommand + " file does not exist. mothur requires formatdb.exe."); m->mothurOutEndLine(); m->control_pressed = true; }
65                 
66                 string blastCommand;
67 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
68                 blastCommand = path + "blastall";       //      format the database, -o option gives us the ability
69 #else
70                 blastCommand = path + "blastall.exe";
71                 //wrap entire string in ""
72                 //blastCommand = "\"" + blastCommand + "\"";
73 #endif
74                 
75                 //test to make sure formatdb exists
76                 ifstream in2;
77                 blastCommand = m->getFullPathName(blastCommand);
78                 ableToOpen = m->openInputFile(blastCommand, in2, "no error"); in2.close();
79                 if(ableToOpen == 1) {   m->mothurOut("[ERROR]: " + blastCommand + " file does not exist. mothur requires blastall.exe."); m->mothurOutEndLine(); m->control_pressed = true; }
80                 
81                 
82                 string megablastCommand;
83 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
84                 megablastCommand = path + "megablast";  //      format the database, -o option gives us the ability
85 #else
86                 megablastCommand = path + "megablast.exe";
87 #endif
88                 
89                 //test to make sure formatdb exists
90                 ifstream in3;
91                 megablastCommand = m->getFullPathName(megablastCommand);
92                 ableToOpen = m->openInputFile(megablastCommand, in3, "no error"); in3.close();
93                 if(ableToOpen == 1) {   m->mothurOut("[ERROR]: " +  megablastCommand + " file does not exist. mothur requires megablast.exe."); m->mothurOutEndLine(); m->control_pressed = true; }
94                 
95         }
96         catch(exception& e) {
97                 m->errorOut(e, "BlastDB", "BlastDB");
98                 exit(1);
99         }
100 }
101 /**************************************************************************************************/
102
103 BlastDB::BlastDB(string b, int tid) : Database() {
104         try {
105                 count = 0;
106                 
107                 path = b;
108                 threadID = tid;
109                 
110                 //make sure blast exists in the write place
111                 if (path == "") {
112                         path = m->argv;
113                         string tempPath = path;
114                         for (int i = 0; i < path.length(); i++) { tempPath[i] = tolower(path[i]); }
115                         path = path.substr(0, (tempPath.find_last_of('m')));
116                         
117 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
118                         path += "blast/bin/";   
119 #else
120                         path += "blast\\bin\\";
121 #endif                  
122                 }
123                 
124                 int randNumber = rand();
125                 string pid = "";
126 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
127                 pid += getpid();        
128 #else
129                 pid += toString(threadID);      
130 #endif
131                 
132                 dbFileName = pid + toString(randNumber) + ".template.unaligned.fasta";
133                 queryFileName = pid + toString(randNumber) + ".candidate.unaligned.fasta";
134                 blastFileName = pid + toString(randNumber) + ".blast";
135                 
136                 string formatdbCommand;
137 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
138                 formatdbCommand = path + "formatdb";    //      format the database, -o option gives us the ability
139 #else
140                 formatdbCommand = path + "formatdb.exe";
141                 //wrap entire string in ""
142                 //formatdbCommand = "\"" + formatdbCommand + "\"";
143 #endif
144                 
145                 //test to make sure formatdb exists
146                 ifstream in;
147                 formatdbCommand = m->getFullPathName(formatdbCommand);
148                 int ableToOpen = m->openInputFile(formatdbCommand, in, "no error"); in.close();
149                 if(ableToOpen == 1) {   m->mothurOut("[ERROR]: " +  formatdbCommand + " file does not exist. mothur requires formatdb.exe."); m->mothurOutEndLine(); m->control_pressed = true; }
150                 
151                 string blastCommand;
152 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
153                 blastCommand = path + "blastall";       //      format the database, -o option gives us the ability
154 #else
155                 blastCommand = path + "blastall.exe";
156                 //wrap entire string in ""
157                 //blastCommand = "\"" + blastCommand + "\"";
158 #endif
159                 
160                 //test to make sure formatdb exists
161                 ifstream in2;
162                 blastCommand = m->getFullPathName(blastCommand);
163                 ableToOpen = m->openInputFile(blastCommand, in2, "no error"); in2.close();
164                 if(ableToOpen == 1) {   m->mothurOut("[ERROR]: " + blastCommand + " file does not exist. mothur requires blastall.exe."); m->mothurOutEndLine(); m->control_pressed = true; }
165                 
166                 
167                 string megablastCommand;
168 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
169                 megablastCommand = path + "megablast";  //      format the database, -o option gives us the ability
170 #else
171                 megablastCommand = path + "megablast.exe";
172                 //wrap entire string in ""
173                 //megablastCommand = "\"" + megablastCommand + "\"";
174 #endif
175                 
176                 //test to make sure formatdb exists
177                 ifstream in3;
178                 megablastCommand = m->getFullPathName(megablastCommand);
179                 ableToOpen = m->openInputFile(megablastCommand, in3, "no error"); in3.close();
180                 if(ableToOpen == 1) {   m->mothurOut("[ERROR]: " + megablastCommand + " file does not exist. mothur requires megablast.exe."); m->mothurOutEndLine(); m->control_pressed = true; }
181                 
182                 
183         }
184         catch(exception& e) {
185                 m->errorOut(e, "BlastDB", "BlastDB");
186                 exit(1);
187         }
188 }
189
190 /**************************************************************************************************/
191
192 BlastDB::~BlastDB(){
193         try{
194                 m->mothurRemove(queryFileName);                         //      let's clean stuff up and remove the temp files
195                 m->mothurRemove(dbFileName);                                    //      let's clean stuff up and remove the temp files
196                 m->mothurRemove((dbFileName+".nsq"));                                   //      let's clean stuff up and remove the temp files
197                 m->mothurRemove((dbFileName+".nsi"));                                   //      let's clean stuff up and remove the temp files
198                 m->mothurRemove((dbFileName+".nsd"));                                   //      let's clean stuff up and remove the temp files
199                 m->mothurRemove((dbFileName+".nin"));                                   //      let's clean stuff up and remove the temp files
200                 m->mothurRemove((dbFileName+".nhr"));                                   //      let's clean stuff up and remove the temp files
201                 m->mothurRemove(blastFileName.c_str());                         //      let's clean stuff up and remove the temp files
202         }
203         catch(exception& e) {
204                 m->errorOut(e, "BlastDB", "~BlastDB");
205                 exit(1);
206         }
207 }
208 /**************************************************************************************************/
209 //assumes you have added all the template sequences using the addSequence function and run generateDB.
210 vector<int> BlastDB::findClosestSequences(Sequence* seq, int n) {
211         try{
212                 vector<int> topMatches;
213                 
214                 ofstream queryFile;
215                 int randNumber = rand();
216                 string pid = scrubName(seq->getName());
217                 
218                 m->openOutputFile((queryFileName+pid+toString(randNumber)), queryFile);
219                 queryFile << '>' << seq->getName() << endl;
220                 queryFile << seq->getUnaligned() << endl;
221                 queryFile.close();
222
223                                 
224                 //      the goal here is to quickly survey the database to find the closest match.  To do this we are using the default
225                 //      wordsize used in megablast.  I'm sure we're sacrificing accuracy for speed, but anyother way would take way too
226                 //      long.  With this setting, it seems comparable in speed to the suffix tree approach.
227                 
228                 string blastCommand;
229                 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
230                 
231                         blastCommand = path + "blastall -p blastn -d " + dbFileName + " -m 8 -W 28 -v " + toString(n) + " -b " + toString(n);
232                         blastCommand += (" -i " + (queryFileName+pid+toString(randNumber)) + " -o " + blastFileName+pid+toString(randNumber));
233                 #else
234                         blastCommand =  "\"" + path + "blastall\" -p blastn -d " + "\"" + dbFileName + "\"" + " -m 8 -W 28 -v " + toString(n) + " -b " + toString(n);
235                         blastCommand += (" -i " + (queryFileName+pid+toString(randNumber)) + " -o " + blastFileName+pid+toString(randNumber));
236                         //wrap entire string in ""
237                         blastCommand = "\"" + blastCommand + "\"";
238                 #endif
239                 
240                 system(blastCommand.c_str());
241                 
242                 ifstream m8FileHandle;
243                 m->openInputFile(blastFileName+pid+toString(randNumber), m8FileHandle, "no error");
244                 
245                 string dummy;
246                 int templateAccession;
247                 m->gobble(m8FileHandle);
248                 
249                 while(!m8FileHandle.eof()){
250                         m8FileHandle >> dummy >> templateAccession >> searchScore;
251                         
252                         //get rest of junk in line
253                         while (!m8FileHandle.eof())     {       char c = m8FileHandle.get(); if (c == 10 || c == 13){   break;  }       } 
254                         
255                         m->gobble(m8FileHandle);
256                         topMatches.push_back(templateAccession);
257                 }
258                 m8FileHandle.close();
259                 m->mothurRemove((queryFileName+pid+toString(randNumber)));
260                 m->mothurRemove((blastFileName+pid+toString(randNumber)));
261
262                 return topMatches;
263         }
264         catch(exception& e) {
265                 m->errorOut(e, "BlastDB", "findClosestSequences");
266                 exit(1);
267         }
268
269 }
270 /**************************************************************************************************/
271 //assumes you have added all the template sequences using the addSequence function and run generateDB.
272 vector<int> BlastDB::findClosestMegaBlast(Sequence* seq, int n, int minPerID) {
273         try{
274                 vector<int> topMatches;
275                 float numBases, mismatch, gap, startQuery, endQuery, startRef, endRef, score;
276                 Scores.clear();
277                 
278                 ofstream queryFile;
279                 int randNumber = rand();
280                 string pid = scrubName(seq->getName());
281                 
282                 m->openOutputFile((queryFileName+pid+toString(randNumber)), queryFile);
283                 queryFile << '>' << seq->getName() << endl;
284                 queryFile << seq->getUnaligned() << endl;
285                 queryFile.close();
286 //              cout << seq->getUnaligned() << endl;
287                 //      the goal here is to quickly survey the database to find the closest match.  To do this we are using the default
288                 //      wordsize used in megablast.  I'm sure we're sacrificing accuracy for speed, but anyother way would take way too
289                 //      long.  With this setting, it seems comparable in speed to the suffix tree approach.
290 //7000004128189528left  0       100             66      0       0       1       66      61      126     1e-31    131    
291                 string blastCommand;
292                 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
293                         blastCommand = path + "megablast -e 1e-10 -d " + dbFileName + " -m 8 -b " + toString(n) + " -v " + toString(n); //-W 28 -p blastn
294                         blastCommand += (" -i " + (queryFileName+pid+toString(randNumber)) + " -o " + blastFileName+pid+toString(randNumber));
295                 #else
296                 //blastCommand = path + "blast\\bin\\megablast -e 1e-10 -d " + dbFileName + " -m 8 -b " + toString(n) + " -v " + toString(n); //-W 28 -p blastn
297                 //blastCommand += (" -i " + (queryFileName+toString(randNumber)) + " -o " + blastFileName+toString(randNumber));
298
299                         blastCommand =  "\"" + path + "megablast\" -e 1e-10 -d " + "\"" + dbFileName + "\"" + " -m 8 -b " + toString(n) + " -v " + toString(n); //-W 28 -p blastn
300                         blastCommand += (" -i " + (queryFileName+pid+toString(randNumber)) + " -o " + blastFileName+pid+toString(randNumber));
301                         //wrap entire string in ""
302                         blastCommand = "\"" + blastCommand + "\"";
303
304                 #endif
305                 system(blastCommand.c_str());
306
307                 ifstream m8FileHandle;
308                 m->openInputFile(blastFileName+pid+toString(randNumber), m8FileHandle, "no error");
309         
310                 string dummy, eScore;
311                 int templateAccession;
312                 m->gobble(m8FileHandle);
313                 
314                 while(!m8FileHandle.eof()){
315                         m8FileHandle >> dummy >> templateAccession >> searchScore >> numBases >> mismatch >> gap >> startQuery >> endQuery >> startRef >> endRef >> eScore >> score;
316 //                      cout << dummy << '\t' << templateAccession << '\t' << searchScore << '\t' << numBases << '\t' << mismatch << '\t' << gap << '\t' << startQuery << '\t' << endQuery << '\t' << startRef << '\t' << endRef << '\t' << eScore << '\t' << score << endl; 
317                         
318                         //get rest of junk in line
319                         //while (!m8FileHandle.eof())   {       char c = m8FileHandle.get(); if (c == 10 || c == 13){   break;  }else{ cout << c; }     } //
320                                 //cout << endl;
321                         m->gobble(m8FileHandle);
322                         if (searchScore >= minPerID) { 
323                                 topMatches.push_back(templateAccession);
324                                 Scores.push_back(searchScore);
325                         }
326 //cout << templateAccession << endl;
327                 }
328                 m8FileHandle.close();
329                 m->mothurRemove((queryFileName+pid+toString(randNumber)));
330                 m->mothurRemove((blastFileName+pid+toString(randNumber)));
331 //cout << "\n" ;                
332                 return topMatches;
333         }
334         catch(exception& e) {
335                 m->errorOut(e, "BlastDB", "findClosestMegaBlast");
336                 exit(1);
337         }
338 }
339 /**************************************************************************************************/
340 void BlastDB::addSequence(Sequence seq) {
341         try {
342         
343                 ofstream unalignedFastaFile;
344                 m->openOutputFileAppend(dbFileName, unalignedFastaFile);                                
345         
346                 //      generating a fasta file with unaligned template
347                 unalignedFastaFile << '>' << count << endl;                                     //      sequences, which will be input to formatdb
348                 unalignedFastaFile << seq.getUnaligned() << endl;
349                 unalignedFastaFile.close();
350         
351                 count++;
352         }
353         catch(exception& e) {
354                 m->errorOut(e, "BlastDB", "addSequence");
355                 exit(1);
356         }
357 }
358 /**************************************************************************************************/
359 void BlastDB::generateDB() {
360         try {
361         
362                 //m->mothurOut("Generating the temporary BLAST database...\t"); cout.flush();
363                         
364                 string formatdbCommand;
365                 
366                 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
367                         formatdbCommand = path + "formatdb -p F -o T -i " + dbFileName; //      format the database, -o option gives us the ability
368                 #else
369                         //formatdbCommand = path + "blast\\bin\\formatdb -p F -o T -i " + dbFileName;   //      format the database, -o option gives us the ability
370
371                         formatdbCommand = "\"" + path + "formatdb\" -p F -o T -i " + "\"" +  dbFileName + "\"";
372                         //wrap entire string in ""
373                         formatdbCommand = "\"" + formatdbCommand + "\"";
374                 #endif
375                 //cout << formatdbCommand << endl;
376                 system(formatdbCommand.c_str());                                                                //      to get the right sequence names, i think. -p F
377                                                                                                                                         //      option tells formatdb that seqs are DNA, not prot
378                 //m->mothurOut("DONE."); m->mothurOutEndLine(); m->mothurOutEndLine(); cout.flush();
379         }
380         catch(exception& e) {
381                 m->errorOut(e, "BlastDB", "generateDB");
382                 exit(1);
383         }
384 }
385 /**************************************************************************************************/
386 string BlastDB::scrubName(string seqName) {
387         try {
388                 
389                 string cleanName = "";
390                 
391                 for (int i = 0; i < seqName.length(); i++) {
392                         if (isalnum(seqName[i])) { cleanName += seqName[i]; }
393                         else { cleanName += "_";  }
394                 }
395                 
396                 return cleanName;
397         }
398         catch(exception& e) {
399                 m->errorOut(e, "BlastDB", "scrubName");
400                 exit(1);
401         }
402 }
403 /**************************************************************************************************/
404
405 /**************************************************************************************************/
406