]> git.donarmstrong.com Git - mothur.git/blob - blastdb.cpp
rewrote metastats command in c++, added mothurRemove function to handle ~ error....
[mothur.git] / blastdb.cpp
1 /*
2  *  blastdb.cpp
3  *  
4  *
5  *  Created by Pat Schloss on 12/22/08.
6  *  Copyright 2008 Patrick D. Schloss. All rights reserved.
7  *
8  */
9
10
11 #include "database.hpp"
12 #include "sequence.hpp"
13 #include "blastdb.hpp"
14
15 /**************************************************************************************************/
16
17 BlastDB::BlastDB(string tag, float gO, float gE, float mm, float mM) : Database(), 
18 gapOpen(gO), gapExtend(gE), match(mm), misMatch(mM) {
19         try {
20                 count = 0;
21
22                 int randNumber = rand();
23                 //int randNumber = 12345;
24                 string pid = "";
25 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
26                 pid +=  toString(getpid());
27 #endif
28                 
29                 dbFileName = tag + pid + toString(randNumber) + ".template.unaligned.fasta";
30                 queryFileName = tag + pid + toString(randNumber) + ".candidate.unaligned.fasta";
31                 blastFileName = tag + pid + toString(randNumber) + ".blast";
32                 
33                 //make sure blast exists in the write place
34                 path = m->argv;
35                 string tempPath = path;
36                 for (int i = 0; i < path.length(); i++) { tempPath[i] = tolower(path[i]); }
37                 path = path.substr(0, (tempPath.find_last_of('m')));
38                 
39                 string formatdbCommand;
40 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
41                 formatdbCommand = path + "blast/bin/formatdb";  //      format the database, -o option gives us the ability
42 #else
43                 formatdbCommand = path + "blast\\bin\\formatdb.exe";
44                 //wrap entire string in ""
45                 //formatdbCommand = "\"" + formatdbCommand + "\"";
46 #endif
47                 
48                 //test to make sure formatdb exists
49                 ifstream in;
50                 formatdbCommand = m->getFullPathName(formatdbCommand);
51                 int ableToOpen = m->openInputFile(formatdbCommand, in, "no error"); in.close();
52                 if(ableToOpen == 1) {   m->mothurOut("[ERROR]: " + formatdbCommand + " file does not exist. mothur requires formatdb.exe to be in the ./blast/bin folder relative to the mothur.exe location."); m->mothurOutEndLine(); m->control_pressed = true; }
53                 
54                 string blastCommand;
55 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
56                 blastCommand = path + "blast/bin/blastall";     //      format the database, -o option gives us the ability
57 #else
58                 blastCommand = path + "blast\\bin\\blastall.exe";
59                 //wrap entire string in ""
60                 //blastCommand = "\"" + blastCommand + "\"";
61 #endif
62                 
63                 //test to make sure formatdb exists
64                 ifstream in2;
65                 blastCommand = m->getFullPathName(blastCommand);
66                 ableToOpen = m->openInputFile(blastCommand, in2, "no error"); in2.close();
67                 if(ableToOpen == 1) {   m->mothurOut("[ERROR]: " + blastCommand + " file does not exist. mothur requires blastall.exe to be in the ./blast/bin folder relative to the mothur.exe location."); m->mothurOutEndLine(); m->control_pressed = true; }
68                 
69                 
70                 string megablastCommand;
71 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
72                 megablastCommand = path + "blast/bin/megablast";        //      format the database, -o option gives us the ability
73 #else
74                 megablastCommand = path + "blast\\bin\\megablast.exe";
75                 //wrap entire string in ""
76                 //megablastCommand = "\"" + megablastCommand + "\"";
77 #endif
78                 
79                 //test to make sure formatdb exists
80                 ifstream in3;
81                 megablastCommand = m->getFullPathName(megablastCommand);
82                 ableToOpen = m->openInputFile(megablastCommand, in3, "no error"); in3.close();
83                 if(ableToOpen == 1) {   m->mothurOut("[ERROR]: " +  megablastCommand + " file does not exist. mothur requires megablast.exe to be in the ./blast/bin folder relative to the mothur.exe location."); m->mothurOutEndLine(); m->control_pressed = true; }
84                 
85         }
86         catch(exception& e) {
87                 m->errorOut(e, "BlastDB", "BlastDB");
88                 exit(1);
89         }
90 }
91 /**************************************************************************************************/
92
93 BlastDB::BlastDB() : Database() {
94         try {
95                 count = 0;
96
97                 int randNumber = rand();
98                 string pid = "";
99 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
100                 pid +=  toString(getpid());
101 #endif
102                 
103                 dbFileName = pid + toString(randNumber) + ".template.unaligned.fasta";
104                 queryFileName = pid + toString(randNumber) + ".candidate.unaligned.fasta";
105                 blastFileName = pid + toString(randNumber) + ".blast";
106                 
107                 //make sure blast exists in the write place
108                 path = m->argv;
109                 string tempPath = path;
110                 for (int i = 0; i < path.length(); i++) { tempPath[i] = tolower(path[i]); }
111                 path = path.substr(0, (tempPath.find_last_of('m')));
112                 
113                 string formatdbCommand;
114 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
115                 formatdbCommand = path + "blast/bin/formatdb";  //      format the database, -o option gives us the ability
116 #else
117                 formatdbCommand = path + "blast\\bin\\formatdb.exe";
118                 //wrap entire string in ""
119                 //formatdbCommand = "\"" + formatdbCommand + "\"";
120 #endif
121                 
122                 //test to make sure formatdb exists
123                 ifstream in;
124                 formatdbCommand = m->getFullPathName(formatdbCommand);
125                 int ableToOpen = m->openInputFile(formatdbCommand, in, "no error"); in.close();
126                 if(ableToOpen == 1) {   m->mothurOut("[ERROR]: " +  formatdbCommand + " file does not exist. mothur requires formatdb.exe to be in the ./blast/bin folder relative to the mothur.exe location."); m->mothurOutEndLine(); m->control_pressed = true; }
127                 
128                 string blastCommand;
129 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
130                 blastCommand = path + "blast/bin/blastall";     //      format the database, -o option gives us the ability
131 #else
132                 blastCommand = path + "blast\\bin\\blastall.exe";
133                 //wrap entire string in ""
134                 //blastCommand = "\"" + blastCommand + "\"";
135 #endif
136                 
137                 //test to make sure formatdb exists
138                 ifstream in2;
139                 blastCommand = m->getFullPathName(blastCommand);
140                 ableToOpen = m->openInputFile(blastCommand, in2, "no error"); in2.close();
141                 if(ableToOpen == 1) {   m->mothurOut("[ERROR]: " + blastCommand + " file does not exist. mothur requires blastall.exe to be in the ./blast/bin folder relative to the mothur.exe location."); m->mothurOutEndLine(); m->control_pressed = true; }
142                 
143                 
144                 string megablastCommand;
145 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
146                 megablastCommand = path + "blast/bin/megablast";        //      format the database, -o option gives us the ability
147 #else
148                 megablastCommand = path + "blast\\bin\\megablast.exe";
149                 //wrap entire string in ""
150                 //megablastCommand = "\"" + megablastCommand + "\"";
151 #endif
152                 
153                 //test to make sure formatdb exists
154                 ifstream in3;
155                 megablastCommand = m->getFullPathName(megablastCommand);
156                 ableToOpen = m->openInputFile(megablastCommand, in3, "no error"); in3.close();
157                 if(ableToOpen == 1) {   m->mothurOut("[ERROR]: " + megablastCommand + " file does not exist. mothur requires megablast.exe to be in the ./blast/bin folder relative to the mothur.exe location."); m->mothurOutEndLine(); m->control_pressed = true; }
158                 
159                 
160         }
161         catch(exception& e) {
162                 m->errorOut(e, "BlastDB", "BlastDB");
163                 exit(1);
164         }
165 }
166
167 /**************************************************************************************************/
168
169 BlastDB::~BlastDB(){
170         try{
171                 m->mothurRemove(queryFileName);                         //      let's clean stuff up and remove the temp files
172                 m->mothurRemove(dbFileName);                                    //      let's clean stuff up and remove the temp files
173                 m->mothurRemove((dbFileName+".nsq"));                                   //      let's clean stuff up and remove the temp files
174                 m->mothurRemove((dbFileName+".nsi"));                                   //      let's clean stuff up and remove the temp files
175                 m->mothurRemove((dbFileName+".nsd"));                                   //      let's clean stuff up and remove the temp files
176                 m->mothurRemove((dbFileName+".nin"));                                   //      let's clean stuff up and remove the temp files
177                 m->mothurRemove((dbFileName+".nhr"));                                   //      let's clean stuff up and remove the temp files
178                 m->mothurRemove(blastFileName.c_str());                         //      let's clean stuff up and remove the temp files
179         }
180         catch(exception& e) {
181                 m->errorOut(e, "BlastDB", "~BlastDB");
182                 exit(1);
183         }
184 }
185 /**************************************************************************************************/
186 //assumes you have added all the template sequences using the addSequence function and run generateDB.
187 vector<int> BlastDB::findClosestSequences(Sequence* seq, int n) {
188         try{
189                 vector<int> topMatches;
190                 
191                 ofstream queryFile;
192                 int randNumber = rand();
193                 string pid = "";
194 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
195                 pid +=  toString(getpid());
196 #endif
197                 
198                 m->openOutputFile((queryFileName+pid+toString(randNumber)), queryFile);
199                 queryFile << '>' << seq->getName() << endl;
200                 queryFile << seq->getUnaligned() << endl;
201                 queryFile.close();
202
203                                 
204                 //      the goal here is to quickly survey the database to find the closest match.  To do this we are using the default
205                 //      wordsize used in megablast.  I'm sure we're sacrificing accuracy for speed, but anyother way would take way too
206                 //      long.  With this setting, it seems comparable in speed to the suffix tree approach.
207                 
208                 string blastCommand;
209                 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
210                 
211                         blastCommand = path + "blast/bin/blastall -p blastn -d " + dbFileName + " -m 8 -W 28 -v " + toString(n) + " -b " + toString(n);
212                         blastCommand += (" -i " + (queryFileName+pid+toString(randNumber)) + " -o " + blastFileName+pid+toString(randNumber));
213                 #else
214                         blastCommand =  "\"" + path + "blast\\bin\\blastall\" -p blastn -d " + "\"" + dbFileName + "\"" + " -m 8 -W 28 -v " + toString(n) + " -b " + toString(n);
215                         blastCommand += (" -i " + (queryFileName+pid+toString(randNumber)) + " -o " + blastFileName+pid+toString(randNumber));
216                         //wrap entire string in ""
217                         blastCommand = "\"" + blastCommand + "\"";
218                 #endif
219                 
220                 system(blastCommand.c_str());
221                 
222                 ifstream m8FileHandle;
223                 m->openInputFile(blastFileName+pid+toString(randNumber), m8FileHandle, "no error");
224                 
225                 string dummy;
226                 int templateAccession;
227                 m->gobble(m8FileHandle);
228                 
229                 while(!m8FileHandle.eof()){
230                         m8FileHandle >> dummy >> templateAccession >> searchScore;
231                         
232                         //get rest of junk in line
233                         while (!m8FileHandle.eof())     {       char c = m8FileHandle.get(); if (c == 10 || c == 13){   break;  }       } 
234                         
235                         m->gobble(m8FileHandle);
236                         topMatches.push_back(templateAccession);
237                 }
238                 m8FileHandle.close();
239                 m->mothurRemove((queryFileName+pid+toString(randNumber)));
240                 m->mothurRemove((blastFileName+pid+toString(randNumber)));
241
242                 return topMatches;
243         }
244         catch(exception& e) {
245                 m->errorOut(e, "BlastDB", "findClosestSequences");
246                 exit(1);
247         }
248
249 }
250 /**************************************************************************************************/
251 //assumes you have added all the template sequences using the addSequence function and run generateDB.
252 vector<int> BlastDB::findClosestMegaBlast(Sequence* seq, int n, int minPerID) {
253         try{
254                 vector<int> topMatches;
255                 float numBases, mismatch, gap, startQuery, endQuery, startRef, endRef, score;
256                 Scores.clear();
257                 
258                 ofstream queryFile;
259                 int randNumber = rand();
260                 string pid = "";
261 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
262                 pid += toString(getpid());
263 #endif
264                 
265                 m->openOutputFile((queryFileName+pid+toString(randNumber)), queryFile);
266                 queryFile << '>' << seq->getName() << endl;
267                 queryFile << seq->getUnaligned() << endl;
268                 queryFile.close();
269 //              cout << seq->getUnaligned() << endl;
270                 //      the goal here is to quickly survey the database to find the closest match.  To do this we are using the default
271                 //      wordsize used in megablast.  I'm sure we're sacrificing accuracy for speed, but anyother way would take way too
272                 //      long.  With this setting, it seems comparable in speed to the suffix tree approach.
273 //7000004128189528left  0       100             66      0       0       1       66      61      126     1e-31    131    
274                 string blastCommand;
275                 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
276                         blastCommand = path + "blast/bin/megablast -e 1e-10 -d " + dbFileName + " -m 8 -b " + toString(n) + " -v " + toString(n); //-W 28 -p blastn
277                         blastCommand += (" -i " + (queryFileName+pid+toString(randNumber)) + " -o " + blastFileName+pid+toString(randNumber));
278                 #else
279                 //blastCommand = path + "blast\\bin\\megablast -e 1e-10 -d " + dbFileName + " -m 8 -b " + toString(n) + " -v " + toString(n); //-W 28 -p blastn
280                 //blastCommand += (" -i " + (queryFileName+toString(randNumber)) + " -o " + blastFileName+toString(randNumber));
281
282                         blastCommand =  "\"" + path + "blast\\bin\\megablast\" -e 1e-10 -d " + "\"" + dbFileName + "\"" + " -m 8 -b " + toString(n) + " -v " + toString(n); //-W 28 -p blastn
283                         blastCommand += (" -i " + (queryFileName+pid+toString(randNumber)) + " -o " + blastFileName+pid+toString(randNumber));
284                         //wrap entire string in ""
285                         blastCommand = "\"" + blastCommand + "\"";
286
287                 #endif
288                 system(blastCommand.c_str());
289
290                 ifstream m8FileHandle;
291                 m->openInputFile(blastFileName+pid+toString(randNumber), m8FileHandle, "no error");
292         
293                 string dummy, eScore;
294                 int templateAccession;
295                 m->gobble(m8FileHandle);
296                 
297                 while(!m8FileHandle.eof()){
298                         m8FileHandle >> dummy >> templateAccession >> searchScore >> numBases >> mismatch >> gap >> startQuery >> endQuery >> startRef >> endRef >> eScore >> score;
299 //                      cout << dummy << '\t' << templateAccession << '\t' << searchScore << '\t' << numBases << '\t' << mismatch << '\t' << gap << '\t' << startQuery << '\t' << endQuery << '\t' << startRef << '\t' << endRef << '\t' << eScore << '\t' << score << endl; 
300                         
301                         //get rest of junk in line
302                         //while (!m8FileHandle.eof())   {       char c = m8FileHandle.get(); if (c == 10 || c == 13){   break;  }else{ cout << c; }     } //
303                                 //cout << endl;
304                         m->gobble(m8FileHandle);
305                         if (searchScore >= minPerID) { 
306                                 topMatches.push_back(templateAccession);
307                                 Scores.push_back(searchScore);
308                         }
309 //cout << templateAccession << endl;
310                 }
311                 m8FileHandle.close();
312                 m->mothurRemove((queryFileName+pid+toString(randNumber)));
313                 m->mothurRemove((blastFileName+pid+toString(randNumber)));
314 //cout << "\n" ;                
315                 return topMatches;
316         }
317         catch(exception& e) {
318                 m->errorOut(e, "BlastDB", "findClosestMegaBlast");
319                 exit(1);
320         }
321 }
322 /**************************************************************************************************/
323 void BlastDB::addSequence(Sequence seq) {
324         try {
325         
326                 ofstream unalignedFastaFile;
327                 m->openOutputFileAppend(dbFileName, unalignedFastaFile);                                
328         
329                 //      generating a fasta file with unaligned template
330                 unalignedFastaFile << '>' << count << endl;                                     //      sequences, which will be input to formatdb
331                 unalignedFastaFile << seq.getUnaligned() << endl;
332                 unalignedFastaFile.close();
333         
334                 count++;
335         }
336         catch(exception& e) {
337                 m->errorOut(e, "BlastDB", "addSequence");
338                 exit(1);
339         }
340 }
341 /**************************************************************************************************/
342 void BlastDB::generateDB() {
343         try {
344         
345                 //m->mothurOut("Generating the temporary BLAST database...\t"); cout.flush();
346                 
347                 path = m->argv;
348                 string tempPath = path;
349                 for (int i = 0; i < path.length(); i++) { tempPath[i] = tolower(path[i]); }
350                 path = path.substr(0, (tempPath.find_last_of('m')));
351         
352                 string formatdbCommand;
353                 
354                 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
355                         formatdbCommand = path + "blast/bin/formatdb -p F -o T -i " + dbFileName;       //      format the database, -o option gives us the ability
356                 #else
357                         //formatdbCommand = path + "blast\\bin\\formatdb -p F -o T -i " + dbFileName;   //      format the database, -o option gives us the ability
358
359                         formatdbCommand = "\"" + path + "blast\\bin\\formatdb\" -p F -o T -i " + "\"" +  dbFileName + "\"";
360                         //wrap entire string in ""
361                         formatdbCommand = "\"" + formatdbCommand + "\"";
362                 #endif
363                 //cout << formatdbCommand << endl;
364                 system(formatdbCommand.c_str());                                                                //      to get the right sequence names, i think. -p F
365                                                                                                                                         //      option tells formatdb that seqs are DNA, not prot
366                 //m->mothurOut("DONE."); m->mothurOutEndLine(); m->mothurOutEndLine(); cout.flush();
367         }
368         catch(exception& e) {
369                 m->errorOut(e, "BlastDB", "generateDB");
370                 exit(1);
371         }
372 }
373 /**************************************************************************************************/
374
375 /**************************************************************************************************/
376