]> git.donarmstrong.com Git - mothur.git/blob - blastdb.cpp
396a6b4b91c4496b80dfc3009cdda4a0f34b12e6
[mothur.git] / blastdb.cpp
1 /*
2  *  blastdb.cpp
3  *  
4  *
5  *  Created by Pat Schloss on 12/22/08.
6  *  Copyright 2008 Patrick D. Schloss. All rights reserved.
7  *
8  */
9
10
11 #include "database.hpp"
12 #include "sequence.hpp"
13 #include "blastdb.hpp"
14
15 /**************************************************************************************************/
16
17 BlastDB::BlastDB(float gO, float gE, float m, float mM) : Database(), 
18 gapOpen(gO), gapExtend(gE), match(m), misMatch(mM) {
19         
20         globaldata = GlobalData::getInstance();
21         count = 0;
22
23         int randNumber = rand();
24         dbFileName = toString(randNumber) + ".template.unaligned.fasta";
25         queryFileName = toString(randNumber) + ".candidate.unaligned.fasta";
26         blastFileName = toString(randNumber) + ".blast";
27
28 }
29
30 /**************************************************************************************************/
31
32 BlastDB::~BlastDB(){
33         remove(queryFileName.c_str());                          //      let's clean stuff up and remove the temp files
34         remove(dbFileName.c_str());                                     //      let's clean stuff up and remove the temp files
35         remove(blastFileName.c_str());                          //      let's clean stuff up and remove the temp files
36 }
37 /**************************************************************************************************/
38 //assumes you have added all the template sequences using the addSequence function and run generateDB.
39 vector<int> BlastDB::findClosestSequences(Sequence* seq, int n) {
40         try{
41                 vector<int> topMatches;
42                 
43                 ofstream queryFile;
44                 openOutputFile(queryFileName, queryFile);
45                 queryFile << '>' << seq->getName() << endl;
46                 queryFile << seq->getUnaligned() << endl;
47                 queryFile.close();
48                                 
49                 //      the goal here is to quickly survey the database to find the closest match.  To do this we are using the default
50                 //      wordsize used in megablast.  I'm sure we're sacrificing accuracy for speed, but anyother way would take way too
51                 //      long.  With this setting, it seems comparable in speed to the suffix tree approach.
52                 
53                 string blastCommand = path + "blast/bin/blastall -p blastn -d " + dbFileName + " -m 8 -W 28 -v " + toString(n) + " -b " + toString(n);;
54                 blastCommand += (" -i " + queryFileName + " -o " + blastFileName);
55                 system(blastCommand.c_str());
56                 
57                 ifstream m8FileHandle;
58                 openInputFile(blastFileName, m8FileHandle);
59                 
60                 string dummy;
61                 int templateAccession;
62                 gobble(m8FileHandle);
63                 
64                 while(!m8FileHandle.eof()){
65                         m8FileHandle >> dummy >> templateAccession >> searchScore;
66                         
67                         //get rest of junk in line
68                         while (!m8FileHandle.eof())     {       char c = m8FileHandle.get(); if (c == 10 || c == 13){   break;  }       } 
69                         
70                         gobble(m8FileHandle);
71                         topMatches.push_back(templateAccession);
72                 }
73                 m8FileHandle.close();
74                 
75                 string root = dbFileName;
76                 string temp = dbFileName + ".nsq";
77                 remove(temp.c_str());   
78                 temp = dbFileName + ".nsi";
79                 remove(temp.c_str());
80                 
81                 temp = dbFileName + ".nsd";
82                 remove(temp.c_str());   
83
84                 temp = dbFileName + ".nin";
85                 remove(temp.c_str());   
86
87                 temp = dbFileName + ".nhr";
88                 remove(temp.c_str());   
89         
90
91                 
92                 return topMatches;
93         }
94         catch(exception& e) {
95                 errorOut(e, "BlastDB", "findClosestSequences");
96                 exit(1);
97         }
98
99 }
100 /**************************************************************************************************/
101 //assumes you have added all the template sequences using the addSequence function and run generateDB.
102 vector<int> BlastDB::findClosestMegaBlast(Sequence* seq, int n) {
103         try{
104                 vector<int> topMatches;
105                 
106                 ofstream queryFile;
107                 openOutputFile(queryFileName, queryFile);
108                 queryFile << '>' << seq->getName() << endl;
109                 queryFile << seq->getUnaligned() << endl;
110                 queryFile.close();
111                                 
112                 //      the goal here is to quickly survey the database to find the closest match.  To do this we are using the default
113                 //      wordsize used in megablast.  I'm sure we're sacrificing accuracy for speed, but anyother way would take way too
114                 //      long.  With this setting, it seems comparable in speed to the suffix tree approach.
115         
116                 string blastCommand = path + "blast/bin/megablast -e 1e-10 -d " + dbFileName + " -m 8 -b " + toString(n) + " -v " + toString(n); //-W 28 -p blastn
117                 blastCommand += (" -i " + queryFileName + " -o " + blastFileName);
118                 system(blastCommand.c_str());
119                 
120                 ifstream m8FileHandle;
121                 openInputFile(blastFileName, m8FileHandle, "no error");
122         
123                 string dummy;
124                 int templateAccession;
125                 gobble(m8FileHandle);
126                 
127                 while(!m8FileHandle.eof()){
128                         m8FileHandle >> dummy >> templateAccession >> searchScore;
129                         
130                         //get rest of junk in line
131                         while (!m8FileHandle.eof())     {       char c = m8FileHandle.get(); if (c == 10 || c == 13){   break;  }       } 
132                         
133                         gobble(m8FileHandle);
134                         topMatches.push_back(templateAccession);
135 //cout << templateAccession << endl;
136                 }
137                 m8FileHandle.close();
138 //cout << "\n\n" ;              
139                 return topMatches;
140         }
141         catch(exception& e) {
142                 errorOut(e, "BlastDB", "findClosest");
143                 exit(1);
144         }
145 }
146 /**************************************************************************************************/
147 void BlastDB::addSequence(Sequence seq) {
148         try {
149         
150                 ofstream unalignedFastaFile;
151                 openOutputFileAppend(dbFileName, unalignedFastaFile);                           
152         
153                 //      generating a fasta file with unaligned template
154                 unalignedFastaFile << '>' << count << endl;                                     //      sequences, which will be input to formatdb
155                 unalignedFastaFile << seq.getUnaligned() << endl;
156                 unalignedFastaFile.close();
157                 
158                 count++;
159         }
160         catch(exception& e) {
161                 errorOut(e, "BlastDB", "addSequence");
162                 exit(1);
163         }
164 }
165 /**************************************************************************************************/
166 void BlastDB::generateDB() {
167         try {
168         
169                 //mothurOut("Generating the temporary BLAST database...\t");    cout.flush();
170                 
171                 path = globaldata->argv;
172                 path = path.substr(0, (path.find_last_of('m')));
173         
174                 string formatdbCommand = path + "blast/bin/formatdb -p F -o T -i " + dbFileName;        //      format the database, -o option gives us the ability
175                 system(formatdbCommand.c_str());                                                                //      to get the right sequence names, i think. -p F
176                                                                                                                                         //      option tells formatdb that seqs are DNA, not prot
177                 //mothurOut("DONE."); mothurOutEndLine();       mothurOutEndLine(); cout.flush();
178         }
179         catch(exception& e) {
180                 errorOut(e, "BlastDB", "generateDB");
181                 exit(1);
182         }
183 }
184
185 /**************************************************************************************************/
186