From: westcott Date: Mon, 25 Apr 2011 11:31:20 +0000 (+0000) Subject: fixed ties issue in maligner getBlastSeqs X-Git-Url: https://git.donarmstrong.com/?p=mothur.git;a=commitdiff_plain;h=39a0a86958038d71083fad3a1a50c02ba9c2fc67 fixed ties issue in maligner getBlastSeqs --- diff --git a/Mothur.xcodeproj/project.pbxproj b/Mothur.xcodeproj/project.pbxproj index ab82683..217151b 100644 --- a/Mothur.xcodeproj/project.pbxproj +++ b/Mothur.xcodeproj/project.pbxproj @@ -1507,9 +1507,9 @@ A7E9B68512D37EC400DA6239 /* chimerarealigner.h */, A7E9B68812D37EC400DA6239 /* chimeraslayer.cpp */, A7E9B68912D37EC400DA6239 /* chimeraslayer.h */, + A7E9B6C212D37EC400DA6239 /* decalc.h */, A7E9B6C112D37EC400DA6239 /* decalc.cpp */, A7E9B74612D37EC400DA6239 /* maligner.h */, - A7E9B6C212D37EC400DA6239 /* decalc.h */, A7E9B74512D37EC400DA6239 /* maligner.cpp */, A7E9B79312D37EC400DA6239 /* pintail.cpp */, A7E9B79412D37EC400DA6239 /* pintail.h */, diff --git a/blastdb.cpp b/blastdb.cpp index 14a1fa3..026c4ca 100644 --- a/blastdb.cpp +++ b/blastdb.cpp @@ -146,6 +146,7 @@ vector BlastDB::findClosestMegaBlast(Sequence* seq, int n) { m->gobble(m8FileHandle); topMatches.push_back(templateAccession); + megaScores.push_back(searchScore); //cout << templateAccession << endl; } m8FileHandle.close(); diff --git a/blastdb.hpp b/blastdb.hpp index 9398ba0..844f7f1 100644 --- a/blastdb.hpp +++ b/blastdb.hpp @@ -36,6 +36,7 @@ private: float gapExtend; float match; float misMatch; + }; #endif diff --git a/commandfactory.cpp b/commandfactory.cpp index fbafe57..63e0690 100644 --- a/commandfactory.cpp +++ b/commandfactory.cpp @@ -184,7 +184,7 @@ CommandFactory::CommandFactory(){ commands["trim.flows"] = "trim.flows"; commands["list.seqs"] = "list.seqs"; commands["get.seqs"] = "get.seqs"; - commands["remove.seqs"] = "get.seqs"; + commands["remove.seqs"] = "remove.seqs"; commands["system"] = "system"; commands["align.check"] = "align.check"; commands["get.sharedseqs"] = "get.sharedseqs"; diff --git a/database.hpp b/database.hpp index bbe01c4..a31075b 100644 --- a/database.hpp +++ b/database.hpp @@ -52,6 +52,7 @@ public: virtual vector findClosestSequences(Sequence*, int) = 0; // returns indexes of n closest sequences to query virtual vector findClosestMegaBlast(Sequence*, int){return results;} virtual float getSearchScore(); + virtual vector getMegaBlastSearchScores() { return megaScores; } //assumes you already called findClosestMegaBlast virtual int getLongestBase(); virtual void readKmerDB(ifstream&){}; virtual void setNumSeqs(int i) { numSeqs = i; } @@ -63,6 +64,7 @@ protected: int numSeqs, longest; float searchScore; vector results; + vector megaScores; }; /**************************************************************************************************/ #endif diff --git a/decalc.cpp b/decalc.cpp index 1442bd8..609b895 100644 --- a/decalc.cpp +++ b/decalc.cpp @@ -821,7 +821,7 @@ vector DeCalculator::findClosest(Sequence* querySeq, vector DeCalculator::findClosest(Sequence* querySeq, vector setParameters(); - string getCommandName() { return "get.seqs"; } + string getCommandName() { return "list.seqs"; } string getCommandCategory() { return "Sequence Processing"; } string getHelpString(); diff --git a/maligner.cpp b/maligner.cpp index 18f7409..4a00c1d 100644 --- a/maligner.cpp +++ b/maligner.cpp @@ -633,20 +633,30 @@ vector Maligner::getBlastSeqs(Sequence* q, int num) { Sequence* queryRight = new Sequence(q->getName()+"right", rightQuery); vector tempIndexesLeft = databaseLeft->findClosestMegaBlast(queryLeft, num+1); + vector leftScores = databaseLeft->getMegaBlastSearchScores(); vector tempIndexesRight = databaseLeft->findClosestMegaBlast(queryRight, num+1); + vector rightScores = databaseLeft->getMegaBlastSearchScores(); //if ((tempIndexesRight.size() == 0) && (tempIndexesLeft.size() == 0)) { m->mothurOut("megablast returned " + toString(tempIndexesRight.size()) + " results for the right end, and " + toString(tempIndexesLeft.size()) + " for the left end. Needed " + toString(num+1) + ". Unable to process sequence " + q->getName()); m->mothurOutEndLine(); return refResults; } vector smaller; + vector smallerScores; vector larger; + vector largerScores; - if (tempIndexesRight.size() < tempIndexesLeft.size()) { smaller = tempIndexesRight; larger = tempIndexesLeft; } - else { smaller = tempIndexesLeft; larger = tempIndexesRight; } + if (tempIndexesRight.size() < tempIndexesLeft.size()) { smaller = tempIndexesRight; smallerScores = rightScores; larger = tempIndexesLeft; largerScores = leftScores; } + else { smaller = tempIndexesLeft; smallerScores = leftScores; larger = tempIndexesRight; largerScores = rightScores; } + + //for (int i = 0; i < smaller.size(); i++) { cout << "smaller = " << smaller[i] << '\t' << smallerScores[i] << endl; } + //cout << endl; + //for (int i = 0; i < larger.size(); i++) { cout << "larger = " << larger[i] << '\t' << largerScores[i] << endl; } //merge results map seen; map::iterator it; - + float lastSmaller = smallerScores[0]; + float lastLarger = largerScores[0]; + int lasti = 0; vector mergedResults; for (int i = 0; i < smaller.size(); i++) { //add left if you havent already @@ -654,6 +664,7 @@ vector Maligner::getBlastSeqs(Sequence* q, int num) { if (it == seen.end()) { mergedResults.push_back(smaller[i]); seen[smaller[i]] = smaller[i]; + lastSmaller = smallerScores[i]; } //add right if you havent already @@ -661,17 +672,63 @@ vector Maligner::getBlastSeqs(Sequence* q, int num) { if (it == seen.end()) { mergedResults.push_back(larger[i]); seen[larger[i]] = larger[i]; + lastLarger = largerScores[i]; } + + lasti = i; + if (mergedResults.size() > num) { break; } } - for (int i = smaller.size(); i < larger.size(); i++) { - it = seen.find(larger[i]); - if (it == seen.end()) { - mergedResults.push_back(larger[i]); - seen[larger[i]] = larger[i]; + //save lasti for smaller ties below + lasti++; + int iSmaller = lasti; + + if (!(mergedResults.size() > num)) { //if we still need more results. + for (int i = smaller.size(); i < larger.size(); i++) { + it = seen.find(larger[i]); + if (it == seen.end()) { + mergedResults.push_back(larger[i]); + seen[larger[i]] = larger[i]; + lastLarger = largerScores[i]; + } + + lasti = i; + if (mergedResults.size() > num) { break; } } } + + //add in any ties from smaller + while (iSmaller < smaller.size()) { + if (smallerScores[iSmaller] == lastSmaller) { + it = seen.find(smaller[iSmaller]); + + if (it == seen.end()) { + mergedResults.push_back(smaller[iSmaller]); + seen[smaller[iSmaller]] = smaller[iSmaller]; + } + } + else { break; } + iSmaller++; + } + + lasti++; + //add in any ties from larger + while (lasti < larger.size()) { + if (largerScores[lasti] == lastLarger) { //is it a tie + it = seen.find(larger[lasti]); + + if (it == seen.end()) { //we haven't already seen it + mergedResults.push_back(larger[lasti]); + seen[larger[lasti]] = larger[lasti]; + } + } + else { break; } + lasti++; + } + + numWanted = seen.size(); + if (mergedResults.size() < numWanted) { numWanted = mergedResults.size(); } //cout << q->getName() << " merged results size = " << mergedResults.size() << '\t' << "numwanted = " << numWanted << endl; for (int i = 0; i < numWanted; i++) {