From 0571957d68cbbc0e425af1db8e808f826010b9e2 Mon Sep 17 00:00:00 2001 From: westcott Date: Wed, 27 Apr 2011 19:37:40 +0000 Subject: [PATCH] working on slayer bug --- blastdb.cpp | 3 +- chimera.cpp | 2 + chimera.h | 2 +- chimeraslayer.cpp | 29 +++++---- cluster.cpp | 1 + database.hpp | 4 +- decalc.cpp | 12 ++-- distancedb.cpp | 3 + kmerdb.cpp | 4 ++ maligner.cpp | 158 +++++++++++++++++++++++++++++++++------------- slayer.cpp | 8 +-- 11 files changed, 155 insertions(+), 71 deletions(-) diff --git a/blastdb.cpp b/blastdb.cpp index 026c4ca..f162cdf 100644 --- a/blastdb.cpp +++ b/blastdb.cpp @@ -114,6 +114,7 @@ vector BlastDB::findClosestSequences(Sequence* seq, int n) { vector BlastDB::findClosestMegaBlast(Sequence* seq, int n) { try{ vector topMatches; + Scores.clear(); ofstream queryFile; @@ -146,7 +147,7 @@ vector BlastDB::findClosestMegaBlast(Sequence* seq, int n) { m->gobble(m8FileHandle); topMatches.push_back(templateAccession); - megaScores.push_back(searchScore); + Scores.push_back(searchScore); //cout << templateAccession << endl; } m8FileHandle.close(); diff --git a/chimera.cpp b/chimera.cpp index ae9ad9a..5412487 100644 --- a/chimera.cpp +++ b/chimera.cpp @@ -186,6 +186,8 @@ vector Chimera::readSeqs(string file) { m->mothurOut("Done."); m->mothurOutEndLine(); + filterString = (string(container[0]->getAligned().length(), '1')); + return container; } catch(exception& e) { diff --git a/chimera.h b/chimera.h index f827346..58d637e 100644 --- a/chimera.h +++ b/chimera.h @@ -136,7 +136,7 @@ class Chimera { public: - Chimera(){ m = MothurOut::getInstance(); length = 0; unaligned = false; } + Chimera(){ m = MothurOut::getInstance(); length = 0; unaligned = false; } virtual ~Chimera(){ for (int i = 0; i < templateSeqs.size(); i++) { delete templateSeqs[i]; } }; virtual bool getUnaligned() { return unaligned; } virtual int getLength() { return length; } diff --git a/chimeraslayer.cpp b/chimeraslayer.cpp index 73037c7..7aa7cd4 100644 --- a/chimeraslayer.cpp +++ b/chimeraslayer.cpp @@ -85,22 +85,22 @@ ChimeraSlayer::ChimeraSlayer(string file, string temp, bool trim, map tempQuerySeqs = readSeqs(fastafile); - //read in all query seqs - vector tempQuerySeqs = readSeqs(fastafile); + vector temp = templateSeqs; + for (int i = 0; i < tempQuerySeqs.size(); i++) { temp.push_back(tempQuerySeqs[i]); } - vector temp = templateSeqs; - for (int i = 0; i < tempQuerySeqs.size(); i++) { temp.push_back(tempQuerySeqs[i]); } + createFilter(temp, 0.0); //just removed columns where all seqs have a gap - createFilter(temp, 0.0); //just removed columns where all seqs have a gap + for (int i = 0; i < tempQuerySeqs.size(); i++) { delete tempQuerySeqs[i]; } - for (int i = 0; i < tempQuerySeqs.size(); i++) { delete tempQuerySeqs[i]; } - - if (m->control_pressed) { return 0; } - - //run filter on template - for (int i = 0; i < templateSeqs.size(); i++) { if (m->control_pressed) { return 0; } runFilter(templateSeqs[i]); } + if (m->control_pressed) { return 0; } + //run filter on template + for (int i = 0; i < templateSeqs.size(); i++) { if (m->control_pressed) { return 0; } runFilter(templateSeqs[i]); } + } string kmerDBNameLeft; string kmerDBNameRight; @@ -751,7 +751,12 @@ int ChimeraSlayer::getChimeras(Sequence* query) { if (m->control_pressed) { return 0; } vector Results = maligner.getOutput(); - + + //cout << query->getName() << endl; + //for (int i = 0; i < Results.size(); i++) { + //cout << Results[i].parent << '\t' << Results[i].regionStart << '\t' << Results[i].regionEnd << '\t' << Results[i].nastRegionStart << '\t' << Results[i].nastRegionEnd << '\t' << Results[i].queryToParent << '\t' << Results[i].queryToParentLocal << endl; + //} + //cout << "done\n" << endl; if (chimeraFlag == "yes") { if (realign) { diff --git a/cluster.cpp b/cluster.cpp index 440562c..ac9f448 100644 --- a/cluster.cpp +++ b/cluster.cpp @@ -214,6 +214,7 @@ void Cluster::update(double& cutOFF){ // The vector has to be traversed in reverse order to preserve the index // for faster removal in removeCell() for (int i=nRowCells-1;i>=0;i--) { + //if you are not the smallCell if (!((rowCells[i]->row == smallRow) && (rowCells[i]->column == smallCol))) { if (rowCells[i]->row == smallRow) { search = rowCells[i]->column; diff --git a/database.hpp b/database.hpp index a31075b..f9c0c48 100644 --- a/database.hpp +++ b/database.hpp @@ -52,7 +52,7 @@ public: virtual vector findClosestSequences(Sequence*, int) = 0; // returns indexes of n closest sequences to query virtual vector findClosestMegaBlast(Sequence*, int){return results;} virtual float getSearchScore(); - virtual vector getMegaBlastSearchScores() { return megaScores; } //assumes you already called findClosestMegaBlast + virtual vector getSearchScores() { return Scores; } //assumes you already called findClosestMegaBlast virtual int getLongestBase(); virtual void readKmerDB(ifstream&){}; virtual void setNumSeqs(int i) { numSeqs = i; } @@ -64,7 +64,7 @@ protected: int numSeqs, longest; float searchScore; vector results; - vector megaScores; + vector Scores; }; /**************************************************************************************************/ #endif diff --git a/decalc.cpp b/decalc.cpp index 2a1736a..52607fc 100644 --- a/decalc.cpp +++ b/decalc.cpp @@ -796,8 +796,8 @@ vector DeCalculator::findClosest(Sequence* querySeq, vector dists; float lastRight = distsRight[0].dist; float lastLeft = distsLeft[0].dist; - int lasti = 0; - for (int i = 0; i < distsLeft.size(); i++) { + //int lasti = 0; + for (int i = 0; i < numWanted+1; i++) { //add left if you havent already it = seen.find(db[distsLeft[i].index]->getName()); if (it == seen.end()) { @@ -816,13 +816,13 @@ vector DeCalculator::findClosest(Sequence* querySeq, vectorgetName() << '\t' << distsRight[i].dist << endl; } - if (dists.size() > numWanted) { lasti = i; break; } //you have enough results + //if (dists.size() > numWanted) { lasti = i; break; } //you have enough results } // cout << "lastLeft\t" << lastLeft << endl; //add in sequences with same distance as last sequence added - lasti++; + /* lasti++; int i = lasti; while (i < distsLeft.size()) { if (distsLeft[i].dist == lastLeft) { @@ -856,8 +856,8 @@ vector DeCalculator::findClosest(Sequence* querySeq, vector dists.size()) { //m->mothurOut("numwanted is larger than the number of template sequences, adjusting numwanted."); m->mothurOutEndLine(); diff --git a/distancedb.cpp b/distancedb.cpp index 8d0c629..c1bf7e7 100644 --- a/distancedb.cpp +++ b/distancedb.cpp @@ -49,6 +49,7 @@ void DistanceDB::addSequence(Sequence seq) { vector DistanceDB::findClosestSequences(Sequence* query, int numWanted){ try { vector topMatches; + Scores.clear(); bool templateSameLength = true; string sequence = query->getAligned(); vector dists; @@ -87,6 +88,7 @@ vector DistanceDB::findClosestSequences(Sequence* query, int numWanted){ //fill topmatches with numwanted closest sequences indexes for (int i = 0; i < numWanted; i++) { topMatches.push_back(dists[i].seq2); + Scores.push_back(dists[i].dist); } }else { int bestIndex = 0; @@ -103,6 +105,7 @@ vector DistanceDB::findClosestSequences(Sequence* query, int numWanted){ } searchScore = smallDist; topMatches.push_back(bestIndex); + Scores.push_back(smallDist); } }else{ diff --git a/kmerdb.cpp b/kmerdb.cpp index e7e8ab2..2703e16 100644 --- a/kmerdb.cpp +++ b/kmerdb.cpp @@ -60,6 +60,7 @@ vector KmerDB::findClosestSequences(Sequence* candidateSeq, int num){ vector topMatches; Kmer kmer(kmerSize); searchScore = 0; + Scores.clear(); vector matches(numSeqs, 0); // a record of the sequences with shared kmers vector timesKmerFound(kmerLocations.size()+1, 0); // a record of the kmers that we have already found @@ -92,6 +93,8 @@ vector KmerDB::findClosestSequences(Sequence* candidateSeq, int num){ //save top matches for (int i = 0; i < num; i++) { topMatches.push_back(seqMatches[i].seq); + float thisScore = 100 * seqMatches[i].match / (float) numKmers; + Scores.push_back(thisScore); } }else{ int bestIndex = 0; @@ -107,6 +110,7 @@ vector KmerDB::findClosestSequences(Sequence* candidateSeq, int num){ searchScore = bestMatch; searchScore = 100 * searchScore / (float) numKmers; // return the Sequence object corresponding to the db topMatches.push_back(bestIndex); + Scores.push_back(searchScore); } return topMatches; } diff --git a/maligner.cpp b/maligner.cpp index 7d7145f..c5cc83a 100644 --- a/maligner.cpp +++ b/maligner.cpp @@ -48,7 +48,9 @@ string Maligner::getResults(Sequence* q, DeCalculator* decalc) { if (m->control_pressed) { return chimera; } refSeqs = minCoverageFilter(refSeqs); - + + + if (refSeqs.size() < 2) { for (int i = 0; i < refSeqs.size(); i++) { delete refSeqs[i]; } percentIdenticalQueryChimera = 0.0; @@ -56,7 +58,7 @@ string Maligner::getResults(Sequence* q, DeCalculator* decalc) { } int chimeraPenalty = computeChimeraPenalty(); - + //cout << "chimeraPenalty = " << chimeraPenalty << endl; //fills outputResults chimera = chimeraMaligner(chimeraPenalty, decalc); @@ -89,8 +91,9 @@ string Maligner::chimeraMaligner(int chimeraPenalty, DeCalculator* decalc) { vector temp = refSeqs; temp.push_back(query); - verticalFilter(temp); + + //for (int i = 0; i < refSeqs.size(); i++) { cout << refSeqs[i]->getName() << endl << refSeqs[i]->getAligned() << endl; } vector< vector > matrix = buildScoreMatrix(query->getAligned().length(), refSeqs.size()); //builds and initializes @@ -100,10 +103,10 @@ string Maligner::chimeraMaligner(int chimeraPenalty, DeCalculator* decalc) { vector trace = extractHighestPath(matrix); -// cout << "traces\n"; -// for(int i=0;igetName() << endl; -// } + //cout << "traces\n"; + //for(int i=0;igetName() << endl; + //} if (trace.size() > 1) { chimera = "yes"; } else { chimera = "no"; return chimera; } @@ -315,7 +318,7 @@ void Maligner::fillScoreMatrix(vector >& ms, vector >& ms, vector >& ms, vectorgetName(); -// for(int j=0;jgetName(); -// for(int j=0;jgetName(); -// for(int j=0;jgetName(); + for(int j=0;jgetName() << endl << seqs[i]->getAligned() << endl << endl; + if ((seqs[i]->getName() == "S000003470") || (seqs[i]->getName() == "S000383265") || (seqs[i]->getName() == "7000004128191054")) { + for(int j=0;jgetName(); + for(int j=0;j Maligner::extractHighestPath(vector > } } -// cout << endl << highestScore << '\t' << highestStruct.size() << '\t' << highestStruct[0].row << endl; + //cout << endl << highestScore << '\t' << highestStruct.size() << '\t' << highestStruct[0].row << endl; vector maxTrace; double maxPercentIdenticalQueryAntiChimera = 0; @@ -450,10 +456,10 @@ vector Maligner::extractHighestPath(vector > vector trace = mapTraceRegionsToAlignment(path, refSeqs); -// cout << "traces\n"; -// for(int j=0;jgetName() << endl; -// } + //cout << "traces\n"; + //for(int j=0;jgetName() << endl; + //} int traceStart = path[0].col; int traceEnd = path[path.size()-1].col; @@ -476,6 +482,7 @@ vector Maligner::extractHighestPath(vector > } // cout << maxPercentIdenticalQueryAntiChimera << endl; return maxTrace; + } catch(exception& e) { @@ -633,9 +640,9 @@ vector Maligner::getBlastSeqs(Sequence* q, int num) { Sequence* queryRight = new Sequence(q->getName()+"right", rightQuery); vector tempIndexesLeft = databaseLeft->findClosestMegaBlast(queryLeft, num+1); - vector leftScores = databaseLeft->getMegaBlastSearchScores(); + vector leftScores = databaseLeft->getSearchScores(); vector tempIndexesRight = databaseLeft->findClosestMegaBlast(queryRight, num+1); - vector rightScores = databaseLeft->getMegaBlastSearchScores(); + vector rightScores = databaseLeft->getSearchScores(); //if ((tempIndexesRight.size() == 0) && (tempIndexesLeft.size() == 0)) { m->mothurOut("megablast returned " + toString(tempIndexesRight.size()) + " results for the right end, and " + toString(tempIndexesLeft.size()) + " for the left end. Needed " + toString(num+1) + ". Unable to process sequence " + q->getName()); m->mothurOutEndLine(); return refResults; } @@ -676,11 +683,11 @@ vector Maligner::getBlastSeqs(Sequence* q, int num) { } lasti = i; - if (mergedResults.size() > num) { break; } + //if (mergedResults.size() > num) { break; } } //save lasti for smaller ties below - lasti++; + /*lasti++; int iSmaller = lasti; if (!(mergedResults.size() > num)) { //if we still need more results. @@ -726,7 +733,17 @@ vector Maligner::getBlastSeqs(Sequence* q, int num) { else { break; } lasti++; } + */ + for (int i = smaller.size(); i < larger.size(); i++) { + //add right if you havent already + it = seen.find(larger[i]); + if (it == seen.end()) { + mergedResults.push_back(larger[i]); + seen[larger[i]] = larger[i]; + lastLarger = largerScores[i]; + } + } numWanted = mergedResults.size(); if (mergedResults.size() < numWanted) { numWanted = mergedResults.size(); } @@ -738,6 +755,7 @@ vector Maligner::getBlastSeqs(Sequence* q, int num) { Sequence* temp = new Sequence(db[mergedResults[i]]->getName(), db[mergedResults[i]]->getAligned()); refResults.push_back(temp); indexes.push_back(mergedResults[i]); + //cout << db[mergedResults[i]]->getName() << endl; } //cout << mergedResults[i] << endl; @@ -768,11 +786,15 @@ vector Maligner::getKmerSeqs(Sequence* q, int num) { vector tempIndexesLeft = databaseLeft->findClosestSequences(queryLeft, numWanted); vector tempIndexesRight = databaseRight->findClosestSequences(queryRight, numWanted); + vector scoresLeft = databaseLeft->getSearchScores(); + vector scoresRight = databaseRight->getSearchScores(); //merge results map seen; map::iterator it; - + float lastRight = scoresRight[0]; + float lastLeft = scoresLeft[0]; + //int lasti = 0; vector mergedResults; for (int i = 0; i < tempIndexesLeft.size(); i++) { //add left if you havent already @@ -780,6 +802,7 @@ vector Maligner::getKmerSeqs(Sequence* q, int num) { if (it == seen.end()) { mergedResults.push_back(tempIndexesLeft[i]); seen[tempIndexesLeft[i]] = tempIndexesLeft[i]; + lastLeft = scoresLeft[i]; } //add right if you havent already @@ -787,16 +810,61 @@ vector Maligner::getKmerSeqs(Sequence* q, int num) { if (it == seen.end()) { mergedResults.push_back(tempIndexesRight[i]); seen[tempIndexesRight[i]] = tempIndexesRight[i]; + lastRight = scoresRight[i]; + } + + //if (mergedResults.size() > numWanted) { lasti = i; break; } //you have enough results + } + + //add in sequences with same distance as last sequence added + /*lasti++; + int i = lasti; + while (i < tempIndexesLeft.size()) { + if (scoresLeft[i] == lastLeft) { + it = seen.find(tempIndexesLeft[i]); + + if (it == seen.end()) { + mergedResults.push_back(tempIndexesLeft[i]); + seen[tempIndexesLeft[i]] = tempIndexesLeft[i]; + } + } + else { break; } + i++; + } + + // cout << "lastRight\t" << lastRight << endl; + //add in sequences with same distance as last sequence added + i = lasti; + while (i < tempIndexesRight.size()) { + if (scoresRight[i] == lastRight) { + it = seen.find(tempIndexesRight[i]); + + if (it == seen.end()) { + mergedResults.push_back(tempIndexesRight[i]); + seen[tempIndexesRight[i]] = tempIndexesRight[i]; + } } + else { break; } + i++; + }*/ + + numWanted = mergedResults.size(); + + if (numWanted > mergedResults.size()) { + //m->mothurOut("numwanted is larger than the number of template sequences, adjusting numwanted."); m->mothurOutEndLine(); + numWanted = mergedResults.size(); } + //cout << q->getName() << endl; vector refResults; for (int i = 0; i < numWanted; i++) { //cout << db[mergedResults[i]]->getName() << endl; - Sequence* temp = new Sequence(db[mergedResults[i]]->getName(), db[mergedResults[i]]->getAligned()); - refResults.push_back(temp); - indexes.push_back(mergedResults[i]); + if (db[mergedResults[i]]->getName() != q->getName()) { + Sequence* temp = new Sequence(db[mergedResults[i]]->getName(), db[mergedResults[i]]->getAligned()); + refResults.push_back(temp); + indexes.push_back(mergedResults[i]); + } } //cout << endl; delete queryRight; diff --git a/slayer.cpp b/slayer.cpp index ba1bf33..8575faf 100644 --- a/slayer.cpp +++ b/slayer.cpp @@ -297,8 +297,8 @@ int Slayer::bootstrapSNPS(vector left, vector right, float& BSA, flo int count_A = 0; // sceneario QLA,QRB supported int count_B = 0; // sceneario QLB,QRA supported - int numLeft = max(1, int(left.size() * (percentSNPSample/(float)100) + 0.5)); - int numRight = max(1, int(right.size() * (percentSNPSample/(float)100) + 0.5)); + int numLeft = max(1, int(left.size() * percentSNPSample/(float)100 + 0.5)); + int numRight = max(1, int(right.size() * percentSNPSample/(float)100 + 0.5)); for (int i = 0; i < iters; i++) { //random sampling with replacement. @@ -365,8 +365,8 @@ int Slayer::bootstrapSNPS(vector left, vector right, float& BSA, flo - BSA = ((float) count_A / (float) iters) * 100; - BSB = ((float) count_B / (float) iters) * 100; + BSA = (float) count_A / (float) iters * 100; + BSB = (float) count_B / (float) iters * 100; //cout << "bsa = " << BSA << " bsb = " << BSB << endl; return 0; -- 2.39.2