}
/**************************************************************************************************/
//assumes you have added all the template sequences using the addSequence function and run generateDB.
-vector<int> BlastDB::findClosestMegaBlast(Sequence* seq, int n) {
+vector<int> BlastDB::findClosestMegaBlast(Sequence* seq, int n, int minPerID) {
try{
vector<int> topMatches;
+ float numBases, mismatch, gap, startQuery, endQuery, startRef, endRef, score;
Scores.clear();
ofstream queryFile;
// the goal here is to quickly survey the database to find the closest match. To do this we are using the default
// wordsize used in megablast. I'm sure we're sacrificing accuracy for speed, but anyother way would take way too
// long. With this setting, it seems comparable in speed to the suffix tree approach.
-
+//7000004128189528left 0 100 66 0 0 1 66 61 126 1e-31 131
string blastCommand = path + "blast/bin/megablast -e 1e-10 -d " + dbFileName + " -m 8 -b " + toString(n) + " -v " + toString(n); //-W 28 -p blastn
blastCommand += (" -i " + (queryFileName+seq->getName()) + " -o " + blastFileName+seq->getName());
system(blastCommand.c_str());
ifstream m8FileHandle;
m->openInputFile(blastFileName+seq->getName(), m8FileHandle, "no error");
- string dummy;
+ string dummy, eScore;
int templateAccession;
m->gobble(m8FileHandle);
while(!m8FileHandle.eof()){
- m8FileHandle >> dummy >> templateAccession >> searchScore;
- //cout << templateAccession << '\t' << searchScore << endl;
+ m8FileHandle >> dummy >> templateAccession >> searchScore >> numBases >> mismatch >> gap >> startQuery >> endQuery >> startRef >> endRef >> eScore >> score;
+ //cout << dummy << '\t' << templateAccession << '\t' << searchScore << '\t';
//get rest of junk in line
- while (!m8FileHandle.eof()) { char c = m8FileHandle.get(); if (c == 10 || c == 13){ break; } }
-
+ //while (!m8FileHandle.eof()) { char c = m8FileHandle.get(); if (c == 10 || c == 13){ break; }else{ cout << c; } } //
+ //cout << endl;
m->gobble(m8FileHandle);
- topMatches.push_back(templateAccession);
- Scores.push_back(searchScore);
+ if (score >= minPerID) {
+ topMatches.push_back(templateAccession);
+ Scores.push_back(searchScore);
+ }
//cout << templateAccession << endl;
}
m8FileHandle.close();
void generateDB();
void addSequence(Sequence);
vector<int> findClosestSequences(Sequence*, int);
- vector<int> findClosestMegaBlast(Sequence*, int);
+ vector<int> findClosestMegaBlast(Sequence*, int, int);
private:
string dbFileName;
//find closest seqs to query in template - returns copies of seqs so trim does not destroy - remember to deallocate
Sequence* newSeq = new Sequence(q->getName(), q->getAligned());
runFilter(newSeq);
- refSeqs = decalc->findClosest(newSeq, thisTemplate, thisFilteredTemplate, numWanted);
+ refSeqs = decalc->findClosest(newSeq, thisTemplate, thisFilteredTemplate, numWanted, minSim);
delete newSeq;
}else if (searchMethod == "blast") {
refSeqs = getBlastSeqs(q, thisTemplate, numWanted); //fills indexes
string leftQuery = queryUnAligned.substr(0, int(queryUnAligned.length() * 0.33)); //first 1/3 of the sequence
string rightQuery = queryUnAligned.substr(int(queryUnAligned.length() * 0.66)); //last 1/3 of the sequence
- Sequence* queryLeft = new Sequence(q->getName()+"left", leftQuery);
- Sequence* queryRight = new Sequence(q->getName()+"right", rightQuery);
-
- vector<int> tempIndexesLeft = databaseLeft->findClosestMegaBlast(queryLeft, num+1);
- vector<int> tempIndexesRight = databaseLeft->findClosestMegaBlast(queryRight, num+1);
+ Sequence* queryLeft = new Sequence(q->getName(), leftQuery);
+ Sequence* queryRight = new Sequence(q->getName(), rightQuery);
+ vector<int> tempIndexesLeft = databaseLeft->findClosestMegaBlast(queryLeft, num+1, minSim);
+ vector<int> tempIndexesRight = databaseLeft->findClosestMegaBlast(queryRight, num+1, minSim);
+ cout << q->getName() << '\t' << leftQuery << '\t' << "leftMatches = " << tempIndexesLeft.size() << '\t' << rightQuery << " rightMatches = " << tempIndexesRight.size() << endl;
vector<int> smaller;
vector<int> larger;
CommandParameter pmismatch("mismatch", "Number", "", "-4.0", "", "", "",false,false); parameters.push_back(pmismatch);
CommandParameter pminsim("minsim", "Number", "", "90", "", "", "",false,false); parameters.push_back(pminsim);
CommandParameter pmincov("mincov", "Number", "", "70", "", "", "",false,false); parameters.push_back(pmincov);
- CommandParameter pminsnp("minsnp", "Number", "", "100", "", "", "",false,false); parameters.push_back(pminsnp);
+ CommandParameter pminsnp("minsnp", "Number", "", "10", "", "", "",false,false); parameters.push_back(pminsnp);
CommandParameter pminbs("minbs", "Number", "", "90", "", "", "",false,false); parameters.push_back(pminbs);
CommandParameter psearch("search", "Multiple", "kmer-blast-distance", "distance", "", "", "",false,false); parameters.push_back(psearch);
CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
CommandParameter ptrim("trim", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(ptrim);
CommandParameter psplit("split", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(psplit);
CommandParameter pnumwanted("numwanted", "Number", "", "15", "", "", "",false,false); parameters.push_back(pnumwanted);
- CommandParameter piters("iters", "Number", "", "100", "", "", "",false,false); parameters.push_back(piters);
+ CommandParameter piters("iters", "Number", "", "1000", "", "", "",false,false); parameters.push_back(piters);
CommandParameter pdivergence("divergence", "Number", "", "1.007", "", "", "",false,false); parameters.push_back(pdivergence);
CommandParameter pparents("parents", "Number", "", "3", "", "", "",false,false); parameters.push_back(pparents);
CommandParameter pincrement("increment", "Number", "", "5", "", "", "",false,false); parameters.push_back(pincrement);
helpString += "The parents parameter allows you to select the number of potential parents to investigate from the numwanted best matches after rating them, default is 3. \n";
helpString += "The mismatch parameter allows you to penalize mismatched bases in blast search, default is -4. \n";
helpString += "The divergence parameter allows you to set a cutoff for chimera determination, default is 1.007. \n";
- helpString += "The iters parameter allows you to specify the number of bootstrap iters to do with the chimeraslayer method, default=100.\n";
+ helpString += "The iters parameter allows you to specify the number of bootstrap iters to do with the chimeraslayer method, default=1000.\n";
helpString += "The minsim parameter allows you to specify a minimum similarity with the parent fragments, default=90. \n";
helpString += "The mincov parameter allows you to specify minimum coverage by closest matches found in template. Default is 70, meaning 70%. \n";
helpString += "The minbs parameter allows you to specify minimum bootstrap support for calling a sequence chimeric. Default is 90, meaning 90%. \n";
- helpString += "The minsnp parameter allows you to specify percent of SNPs to sample on each side of breakpoint for computing bootstrap support (default: 100) \n";
+ helpString += "The minsnp parameter allows you to specify percent of SNPs to sample on each side of breakpoint for computing bootstrap support (default: 10) \n";
helpString += "The search parameter allows you to specify search method for finding the closest parent. Choices are distance, blast, and kmer, default distance. \n";
helpString += "The realign parameter allows you to realign the query to the potential parents. Choices are true or false, default true. \n";
helpString += "The chimera.slayer command should be in the following format: \n";
search = validParameter.validFile(parameters, "search", false); if (search == "not found") { search = "distance"; }
- temp = validParameter.validFile(parameters, "iters", false); if (temp == "not found") { temp = "100"; }
+ temp = validParameter.validFile(parameters, "iters", false); if (temp == "not found") { temp = "1000"; }
convert(temp, iters);
temp = validParameter.validFile(parameters, "increment", false); if (temp == "not found") { temp = "5"; }
virtual void addSequence(Sequence) = 0; //add sequence to search engine
virtual string getName(int) { return ""; }
virtual vector<int> findClosestSequences(Sequence*, int) = 0; // returns indexes of n closest sequences to query
- virtual vector<int> findClosestMegaBlast(Sequence*, int){return results;}
+ virtual vector<int> findClosestMegaBlast(Sequence*, int, int){return results;}
virtual float getSearchScore();
virtual vector<float> getSearchScores() { return Scores; } //assumes you already called findClosestMegaBlast
virtual int getLongestBase();
#include "dist.h"
#include "eachgapdist.h"
#include "ignoregaps.h"
-#include "eachgapdistignorens.h"
+#include "eachgapdist.h"
//***************************************************************************************************************
void DeCalculator::setMask(string ms) {
}
//***************************************************************************************************************
//gets closest matches to each end, since chimeras will most likely have different parents on each end
-vector<Sequence*> DeCalculator::findClosest(Sequence* querySeq, vector<Sequence*>& thisTemplate, vector<Sequence*>& thisFilteredTemplate, int numWanted) {
+vector<Sequence*> DeCalculator::findClosest(Sequence* querySeq, vector<Sequence*>& thisTemplate, vector<Sequence*>& thisFilteredTemplate, int numWanted, int minSim) {
try {
//indexes.clear();
vector<SeqDist> distsLeft;
vector<SeqDist> distsRight;
- Dist* distcalculator = new eachGapDistIgnoreNs();
+ Dist* distcalculator = new eachGapDist();
string queryUnAligned = querySeq->getUnaligned();
int numBases = int(queryUnAligned.length() * 0.33);
}
+ //are we still above the minimum similarity cutoff
+ if ((lastLeft >= minSim) || (lastRight >= minSim)) {
+ //add in ties from left
+ int i = numWanted;
+ while (i < distsLeft.size()) {
+ if (distsLeft[i].dist == lastLeft) { dists.push_back(distsLeft[i]); }
+ else { break; }
+ i++;
+ }
+
+ //add in ties from right
+ i = numWanted;
+ while (i < distsRight.size()) {
+ if (distsRight[i].dist == lastRight) { dists.push_back(distsRight[i]); }
+ else { break; }
+ i++;
+ }
+ }
+
- //numWanted = dists.size();
//cout << numWanted << endl;
for (int i = 0; i < dists.size(); i++) {
// cout << db[dists[i].index]->getName() << '\t' << dists[i].dist << endl;
- if (thisTemplate[dists[i].index]->getName() != querySeq->getName()) {
+ if ((thisTemplate[dists[i].index]->getName() != querySeq->getName()) && (dists[i].dist >= minSim)) {
Sequence* temp = new Sequence(thisTemplate[dists[i].index]->getName(), thisTemplate[dists[i].index]->getAligned()); //have to make a copy so you can trim and filter without stepping on eachother.
seqsMatches.push_back(temp);
Sequence* seqsMatch;
- Dist* distcalculator = new eachGapDistIgnoreNs();
+ Dist* distcalculator = new eachGapDist();
int index = 0;
int smallest = 1000000;
DeCalculator() { m = MothurOut::getInstance(); }
~DeCalculator() {};
- vector<Sequence*> findClosest(Sequence*, vector<Sequence*>&, vector<Sequence*>&, int); //takes querySeq, a reference db, filteredRefDB, numWanted
+ vector<Sequence*> findClosest(Sequence*, vector<Sequence*>&, vector<Sequence*>&, int, int); //takes querySeq, a reference db, filteredRefDB, numWanted, minSim
Sequence* findClosest(Sequence*, vector<Sequence*>);
set<int> getPos() { return h; }
void setMask(string);
temp.queryToParentLocal = computePercentID(queryInRegion, parentInRegion);
// cout << temp.parent << '\t' << "NAST:" << temp.nastRegionStart << '-' << temp.nastRegionEnd << " G:" << temp.queryToParent << " L:" << temp.queryToParentLocal << endl;
+
outputResults.push_back(temp);
}
}
}
/***********************************************************************/
-float Slayer::computePercentID(string queryFrag, string parent, int left, int right) {
+float Slayer::computePercentID(string queryAlign, string chimera, int left, int right) {
try {
- int total = 0;
- int matches = 0;
-
+
+ int numIdentical = 0;
+ int countA = 0;
+ int countB = 0;
for (int i = left; i <= right; i++) {
- total++;
- if (queryFrag[i] == parent[i]) {
- matches++;
+ if (((queryAlign[i] != 'G') && (queryAlign[i] != 'T') && (queryAlign[i] != 'A') && (queryAlign[i] != 'C')&& (queryAlign[i] != '.') && (queryAlign[i] != '-')) ||
+ ((chimera[i] != 'G') && (chimera[i] != 'T') && (chimera[i] != 'A') && (chimera[i] != 'C')&& (chimera[i] != '.') && (chimera[i] != '-'))) {}
+ else {
+
+ bool charA = false; bool charB = false;
+ if ((queryAlign[i] == 'G') || (queryAlign[i] == 'T') || (queryAlign[i] == 'A') || (queryAlign[i] == 'C')) { charA = true; }
+ if ((chimera[i] == 'G') || (chimera[i] == 'T') || (chimera[i] == 'A') || (chimera[i] == 'C')) { charB = true; }
+
+ if (charA || charB) {
+
+ if (charA) { countA++; }
+ if (charB) { countB++; }
+
+ if (queryAlign[i] == chimera[i]) {
+ numIdentical++;
+ }
+ }
}
}
-
- float percentID =( matches/(float)total) * 100;
- return percentID;
+ float numBases = (countA + countB) /(float) 2;
+
+ if (numBases == 0) { return 0; }
+
+ float percentIdentical = (numIdentical/(float)numBases) * 100;
+
+ return percentIdentical;
+
}
catch(exception& e) {
m->errorOut(e, "Slayer", "computePercentID");