5 * Created by Pat Schloss on 12/22/08.
6 * Copyright 2008 Patrick D. Schloss. All rights reserved.
11 #include "database.hpp"
12 #include "sequence.hpp"
13 #include "blastdb.hpp"
15 /**************************************************************************************************/
17 BlastDB::BlastDB(float gO, float gE, float m, float mM) : Database(),
18 gapOpen(gO), gapExtend(gE), match(m), misMatch(mM) {
20 globaldata = GlobalData::getInstance();
23 int randNumber = rand();
24 dbFileName = toString(randNumber) + ".template.unaligned.fasta";
25 queryFileName = toString(randNumber) + ".candidate.unaligned.fasta";
26 blastFileName = toString(randNumber) + ".blast";
30 /**************************************************************************************************/
33 remove(queryFileName.c_str()); // let's clean stuff up and remove the temp files
34 remove(dbFileName.c_str()); // let's clean stuff up and remove the temp files
35 remove(blastFileName.c_str()); // let's clean stuff up and remove the temp files
37 /**************************************************************************************************/
38 //assumes you have added all the template sequences using the addSequence function and run generateDB.
39 vector<int> BlastDB::findClosestSequences(Sequence* seq, int n) {
41 vector<int> topMatches;
44 openOutputFile(queryFileName, queryFile);
45 queryFile << '>' << seq->getName() << endl;
46 queryFile << seq->getUnaligned() << endl;
49 // the goal here is to quickly survey the database to find the closest match. To do this we are using the default
50 // wordsize used in megablast. I'm sure we're sacrificing accuracy for speed, but anyother way would take way too
51 // long. With this setting, it seems comparable in speed to the suffix tree approach.
53 string blastCommand = path + "blast/bin/blastall -p blastn -d " + dbFileName + " -m 8 -W 28 -v " + toString(n) + " -b " + toString(n);;
54 blastCommand += (" -i " + queryFileName + " -o " + blastFileName);
55 system(blastCommand.c_str());
57 ifstream m8FileHandle;
58 openInputFile(blastFileName, m8FileHandle);
61 int templateAccession;
64 while(!m8FileHandle.eof()){
65 m8FileHandle >> dummy >> templateAccession >> searchScore;
67 //get rest of junk in line
68 while (!m8FileHandle.eof()) { char c = m8FileHandle.get(); if (c == 10 || c == 13){ break; } }
71 topMatches.push_back(templateAccession);
75 string root = dbFileName;
76 string temp = dbFileName + ".nsq";
78 temp = dbFileName + ".nsi";
81 temp = dbFileName + ".nsd";
84 temp = dbFileName + ".nin";
87 temp = dbFileName + ".nhr";
95 errorOut(e, "BlastDB", "findClosestSequences");
100 /**************************************************************************************************/
101 //assumes you have added all the template sequences using the addSequence function and run generateDB.
102 vector<int> BlastDB::findClosestMegaBlast(Sequence* seq, int n) {
104 vector<int> topMatches;
107 openOutputFile(queryFileName, queryFile);
108 queryFile << '>' << seq->getName() << endl;
109 queryFile << seq->getUnaligned() << endl;
112 // the goal here is to quickly survey the database to find the closest match. To do this we are using the default
113 // wordsize used in megablast. I'm sure we're sacrificing accuracy for speed, but anyother way would take way too
114 // long. With this setting, it seems comparable in speed to the suffix tree approach.
116 string blastCommand = path + "blast/bin/megablast -e 1e-10 -d " + dbFileName + " -m 8 -b " + toString(n) + " -v " + toString(n); //-W 28 -p blastn
117 blastCommand += (" -i " + queryFileName + " -o " + blastFileName);
118 system(blastCommand.c_str());
120 ifstream m8FileHandle;
121 openInputFile(blastFileName, m8FileHandle, "no error");
124 int templateAccession;
125 gobble(m8FileHandle);
127 while(!m8FileHandle.eof()){
128 m8FileHandle >> dummy >> templateAccession >> searchScore;
130 //get rest of junk in line
131 while (!m8FileHandle.eof()) { char c = m8FileHandle.get(); if (c == 10 || c == 13){ break; } }
133 gobble(m8FileHandle);
134 topMatches.push_back(templateAccession);
135 //cout << templateAccession << endl;
137 m8FileHandle.close();
141 catch(exception& e) {
142 errorOut(e, "BlastDB", "findClosest");
146 /**************************************************************************************************/
147 void BlastDB::addSequence(Sequence seq) {
150 ofstream unalignedFastaFile;
151 openOutputFileAppend(dbFileName, unalignedFastaFile);
153 // generating a fasta file with unaligned template
154 unalignedFastaFile << '>' << count << endl; // sequences, which will be input to formatdb
155 unalignedFastaFile << seq.getUnaligned() << endl;
156 unalignedFastaFile.close();
160 catch(exception& e) {
161 errorOut(e, "BlastDB", "addSequence");
165 /**************************************************************************************************/
166 void BlastDB::generateDB() {
169 //mothurOut("Generating the temporary BLAST database...\t"); cout.flush();
171 path = globaldata->argv;
172 path = path.substr(0, (path.find_last_of('m')));
174 string formatdbCommand = path + "blast/bin/formatdb -p F -o T -i " + dbFileName; // format the database, -o option gives us the ability
175 system(formatdbCommand.c_str()); // to get the right sequence names, i think. -p F
176 // option tells formatdb that seqs are DNA, not prot
177 //mothurOut("DONE."); mothurOutEndLine(); mothurOutEndLine(); cout.flush();
179 catch(exception& e) {
180 errorOut(e, "BlastDB", "generateDB");
185 /**************************************************************************************************/