// wordsize used in megablast. I'm sure we're sacrificing accuracy for speed, but anyother way would take way too
// long. With this setting, it seems comparable in speed to the suffix tree approach.
- string blastCommand = path + "blast/bin/blastall -p blastn -d " + dbFileName + " -b 1 -m 8 -W 28 -v " + toString(n);
+ string blastCommand = path + "blast/bin/blastall -p blastn -d " + dbFileName + " -m 8 -W 28 -v " + toString(n) + " -b " + toString(n);;
blastCommand += (" -i " + queryFileName + " -o " + blastFileName);
system(blastCommand.c_str());
}
/**************************************************************************************************/
//assumes you have added all the template sequences using the addSequence function and run generateDB.
-map<int, float> BlastDB::findClosest(Sequence* seq, int n) {
+vector<int> BlastDB::findClosestMegaBlast(Sequence* seq, int n) {
try{
- map<int, float> topMatches;
+ vector<int> topMatches;
ofstream queryFile;
openOutputFile(queryFileName, queryFile);
// wordsize used in megablast. I'm sure we're sacrificing accuracy for speed, but anyother way would take way too
// long. With this setting, it seems comparable in speed to the suffix tree approach.
- string blastCommand = path + "blast/bin/blastall -p blastn -d " + dbFileName + " -m 8 -W 28 -b " + toString(n) + " -v " + toString(n);
+ string blastCommand = path + "blast/bin/megablast -e 1e-10 -d " + dbFileName + " -m 8 -b " + toString(n) + " -v " + toString(n); //-W 28 -p blastn
blastCommand += (" -i " + queryFileName + " -o " + blastFileName);
system(blastCommand.c_str());
string dummy;
int templateAccession;
gobble(m8FileHandle);
-//string name = seq->getName();
-//ofstream out;
-//openOutputFileAppend(name, out);
+
while(!m8FileHandle.eof()){
m8FileHandle >> dummy >> templateAccession >> searchScore;
-//out << dummy << '\t' << templateAccession << '\t' << searchScore << endl;
+
//get rest of junk in line
- while (!m8FileHandle.eof()) { char c = m8FileHandle.get();
- //out << c;
- if (c == 10 || c == 13){ break; } }
+ while (!m8FileHandle.eof()) { char c = m8FileHandle.get(); if (c == 10 || c == 13){ break; } }
gobble(m8FileHandle);
- topMatches[templateAccession] = searchScore;
+ topMatches.push_back(templateAccession);
}
m8FileHandle.close();
-//out.close();
+
return topMatches;
}
catch(exception& e) {
void generateDB();
void addSequence(Sequence);
vector<int> findClosestSequences(Sequence*, int);
- map<int, float> findClosest(Sequence*, int); //template index -> searchscore
+ vector<int> findClosestMegaBlast(Sequence*, int);
private:
string dbFileName;
float divR;
};
/***********************************************************************/
-struct rank {
- int num;
- float score;
- rank(int n, float s) : num(n), score(s) {}
-};
-/***********************************************************************/
-
struct SeqDist {
Sequence* seq;
float dist;
};
//********************************************************************************************************************
-//sorts highest to lowest
-inline bool compareMembers(rank left, rank right){
- return (left.score > right.score);
-}
-//********************************************************************************************************************
//sorts lowest to highest
inline bool compareRegionStart(results left, results right){
return (left.nastRegionStart < right.nastRegionStart);
sort(parents.begin(), parents.end(), compareRegionStart);
//make sure you don't cutoff beginning of query
- if (parents[0].nastRegionStart > 0) { newQuery += qAligned.substr(0, parents[0].nastRegionStart+1); }
+ if (parents[0].nastRegionStart > 0) { newQuery += qAligned.substr(0, parents[0].nastRegionStart); }
int longest = 0;
//take query and break apart into pieces using breakpoints given by results of parents
for (int i = 0; i < parents.size(); i++) {
- int length = parents[i].nastRegionEnd - parents[i].nastRegionStart+1;
+ cout << parents[i].parent << '\t' << parents[i].nastRegionStart << '\t' << parents[i].nastRegionEnd << endl;
+ int length = parents[i].nastRegionEnd - parents[i].nastRegionStart;
string q = qAligned.substr(parents[i].nastRegionStart, length);
+ cout << "query = " << q << endl;
Sequence* queryFrag = new Sequence(query->getName(), q);
queryParts.push_back(queryFrag);
Sequence* parent = getSequence(parents[i].parent);
string p = parent->getAligned();
+
p = p.substr(parents[i].nastRegionStart, length);
+ cout << "parent = " << p << endl;
parent->setAligned(p);
parentParts.push_back(parent);
newQuery += queryParts[i]->getAligned();
}
+
//make sure you don't cutoff end of query
- if (parents[parents.size()-1].nastRegionEnd < qAligned.length()) { newQuery += qAligned.substr(parents[parents.size()-1].nastRegionEnd-1); }
+ if (parents[parents.size()-1].nastRegionEnd < qAligned.length()) { newQuery += qAligned.substr(parents[parents.size()-1].nastRegionEnd); }
//set query to new aligned string
query->setAligned(newQuery);
vector<results> Results = maligner->getOutput();
//realign query to parents to improve slayers detection rate
- //ChimeraReAligner realigner(templateSeqs, match, misMatch);
- //realigner.reAlign(query, Results);
-
+ ChimeraReAligner realigner(templateSeqs, match, misMatch);
+ realigner.reAlign(query, Results);
+cout << query->getName() << '\n' << query->getAligned() << endl;
//if (chimeraFlag == "yes") {
//get sequence that were given from maligner results
virtual void generateDB() = 0;
virtual void addSequence(Sequence) = 0; //add sequence to search engine
virtual vector<int> findClosestSequences(Sequence*, int) = 0; // returns indexes of n closest sequences to query
- virtual map<int, float> findClosest(Sequence*, int){ return results; } // returns of n closest sequences to query and their search scores
+ virtual vector<int> findClosestMegaBlast(Sequence*, int){return results;}
virtual float getSearchScore();
virtual int getLongestBase();
virtual void readKmerDB(ifstream&){};
protected:
int numSeqs, longest;
float searchScore;
- map<int, float> results;
+ vector<int> results;
};
/**************************************************************************************************/
#endif
refSeqs = getBlastSeqs(query, numWanted);
}
-//ofstream out;
-//string name = toString(numi+1);
-//cout << name << endl;
-//name = name.substr(name.find_first_of("|")+1);
-//cout << name << endl;
-//name = name.substr(name.find_first_of("|")+1);
-//cout << name << endl;
-//name = name.substr(0, name.find_last_of("|"));
-//cout << name << endl;
-//string filename = name + ".seqsformaligner";
-//openOutputFile(filename, out);
-//for (int u = 0; u < refSeqs.size(); u++) { refSeqs[u]->printSequence(out); }
-//out.close();
-//filename = name + ".fasta";
-//openOutputFile(filename, out);
-//query->printSequence(out);
-//out.close();
-
-//for (int i = 0; i < refSeqs.size(); i++) { cout << refSeqs[i]->getName() << endl; }
-//cout << "before = " << refSeqs.size() << endl;
refSeqs = minCoverageFilter(refSeqs);
-//cout << "after = " << refSeqs.size() << endl;
+
if (refSeqs.size() < 2) {
for (int i = 0; i < refSeqs.size(); i++) { delete refSeqs[i]; }
percentIdenticalQueryChimera = 0.0;
}
int chimeraPenalty = computeChimeraPenalty();
-//cout << chimeraPenalty << endl;
+
//fills outputResults
chimera = chimeraMaligner(chimeraPenalty, decalc);
}
//***************************************************************************************************************
vector<Sequence*> Maligner::getBlastSeqs(Sequence* q, int num) {
- try {
-cout << q->getName() << endl;
+ try {
//generate blastdb
Database* database = new BlastDB(-2.0, -1.0, matchScore, misMatchPenalty);
for (int i = 0; i < db.size(); i++) { database->addSequence(*db[i]); }
database->setNumSeqs(db.size());
//get parts of query
- string queryAligned = q->getAligned();
- string leftQuery = queryAligned.substr(0, (queryAligned.length() / 3)); //first 1/3 of the sequence
- string rightQuery = queryAligned.substr(((queryAligned.length() / 3)*2)); //last 1/3 of the sequence
-
+ string queryUnAligned = q->getUnaligned();
+ string leftQuery = queryUnAligned.substr(0, int(queryUnAligned.length() * 0.33)); //first 1/3 of the sequence
+ string rightQuery = queryUnAligned.substr(int(queryUnAligned.length() * 0.66)); //last 1/3 of the sequence
+
Sequence* queryLeft = new Sequence(q->getName(), leftQuery);
Sequence* queryRight = new Sequence(q->getName(), rightQuery);
- map<int, float> tempIndexesRight = database->findClosest(queryRight, num);
- map<int, float> tempIndexesLeft = database->findClosest(queryLeft, num);
-
- //merge results
- vector<rank> mergedResults;
+ vector<int> tempIndexesRight = database->findClosestMegaBlast(queryRight, num+1);
+ vector<int> tempIndexesLeft = database->findClosestMegaBlast(queryLeft, num+1);
- map<int, float>::iterator it;
- map<int, float>::iterator it2;
+ //merge results
+ map<int, int> seen;
+ map<int, int>::iterator it;
- //add in right guys merging common finds
- for (it = tempIndexesRight.begin(); it != tempIndexesRight.end(); it++) {
- it2 = tempIndexesLeft.find(it->first);
-
- if (it2 == tempIndexesLeft.end()) { //result only present in right
- rank temp(it->first, it->second);
- mergedResults.push_back(temp);
+ vector<int> mergedResults;
+ for (int i = 0; i < tempIndexesLeft.size(); i++) {
+ //add left if you havent already
+ it = seen.find(tempIndexesLeft[i]);
+ if (it == seen.end()) {
+ mergedResults.push_back(tempIndexesLeft[i]);
+ seen[tempIndexesLeft[i]] = tempIndexesLeft[i];
+ }
- }else { //result present in both save best score
- float bestscore;
- if (it->second > it2->second) { bestscore = it->second; }
- else { bestscore = it2->second; }
-
- rank temp(it->first, bestscore);
- mergedResults.push_back(temp);
-
- tempIndexesLeft.erase(it2);
+ //add right if you havent already
+ it = seen.find(tempIndexesRight[i]);
+ if (it == seen.end()) {
+ mergedResults.push_back(tempIndexesRight[i]);
+ seen[tempIndexesRight[i]] = tempIndexesRight[i];
}
}
- //add in unique left guys
- for (it = tempIndexesLeft.begin(); it != tempIndexesLeft.end(); it++) {
- rank temp(it->first, it->second);
- mergedResults.push_back(temp);
- }
-
- sort(mergedResults.begin(), mergedResults.end(), compareMembers);
vector<Sequence*> refResults;
for (int i = 0; i < numWanted; i++) {
- Sequence* temp = new Sequence(db[mergedResults[i].num]->getName(), db[mergedResults[i].num]->getAligned());
+ Sequence* temp = new Sequence(db[mergedResults[i]]->getName(), db[mergedResults[i]]->getAligned());
refResults.push_back(temp);
-cout << db[mergedResults[i].num]->getName() << endl;
}
delete queryRight;