From 1cf188b912d6da8f2cd03dd71cecef664a699c1a Mon Sep 17 00:00:00 2001 From: westcott Date: Wed, 4 May 2011 14:11:02 +0000 Subject: [PATCH] changes for chimera slayer --- blastdb.cpp | 21 ++++++++++++--------- blastdb.hpp | 2 +- chimeraslayer.cpp | 12 ++++++------ chimeraslayercommand.cpp | 10 +++++----- database.hpp | 2 +- decalc.cpp | 30 ++++++++++++++++++++++++------ decalc.h | 2 +- maligner.cpp | 1 + slayer.cpp | 40 ++++++++++++++++++++++++++++++---------- 9 files changed, 81 insertions(+), 39 deletions(-) diff --git a/blastdb.cpp b/blastdb.cpp index fdb5456..3e1210b 100644 --- a/blastdb.cpp +++ b/blastdb.cpp @@ -111,9 +111,10 @@ vector BlastDB::findClosestSequences(Sequence* seq, int n) { } /**************************************************************************************************/ //assumes you have added all the template sequences using the addSequence function and run generateDB. -vector BlastDB::findClosestMegaBlast(Sequence* seq, int n) { +vector BlastDB::findClosestMegaBlast(Sequence* seq, int n, int minPerID) { try{ vector topMatches; + float numBases, mismatch, gap, startQuery, endQuery, startRef, endRef, score; Scores.clear(); ofstream queryFile; @@ -126,7 +127,7 @@ vector BlastDB::findClosestMegaBlast(Sequence* seq, int n) { // the goal here is to quickly survey the database to find the closest match. To do this we are using the default // wordsize used in megablast. I'm sure we're sacrificing accuracy for speed, but anyother way would take way too // long. With this setting, it seems comparable in speed to the suffix tree approach. - +//7000004128189528left 0 100 66 0 0 1 66 61 126 1e-31 131 string blastCommand = path + "blast/bin/megablast -e 1e-10 -d " + dbFileName + " -m 8 -b " + toString(n) + " -v " + toString(n); //-W 28 -p blastn blastCommand += (" -i " + (queryFileName+seq->getName()) + " -o " + blastFileName+seq->getName()); system(blastCommand.c_str()); @@ -134,20 +135,22 @@ vector BlastDB::findClosestMegaBlast(Sequence* seq, int n) { ifstream m8FileHandle; m->openInputFile(blastFileName+seq->getName(), m8FileHandle, "no error"); - string dummy; + string dummy, eScore; int templateAccession; m->gobble(m8FileHandle); while(!m8FileHandle.eof()){ - m8FileHandle >> dummy >> templateAccession >> searchScore; - //cout << templateAccession << '\t' << searchScore << endl; + m8FileHandle >> dummy >> templateAccession >> searchScore >> numBases >> mismatch >> gap >> startQuery >> endQuery >> startRef >> endRef >> eScore >> score; + //cout << dummy << '\t' << templateAccession << '\t' << searchScore << '\t'; //get rest of junk in line - while (!m8FileHandle.eof()) { char c = m8FileHandle.get(); if (c == 10 || c == 13){ break; } } - + //while (!m8FileHandle.eof()) { char c = m8FileHandle.get(); if (c == 10 || c == 13){ break; }else{ cout << c; } } // + //cout << endl; m->gobble(m8FileHandle); - topMatches.push_back(templateAccession); - Scores.push_back(searchScore); + if (score >= minPerID) { + topMatches.push_back(templateAccession); + Scores.push_back(searchScore); + } //cout << templateAccession << endl; } m8FileHandle.close(); diff --git a/blastdb.hpp b/blastdb.hpp index 844f7f1..9b8965d 100644 --- a/blastdb.hpp +++ b/blastdb.hpp @@ -23,7 +23,7 @@ public: void generateDB(); void addSequence(Sequence); vector findClosestSequences(Sequence*, int); - vector findClosestMegaBlast(Sequence*, int); + vector findClosestMegaBlast(Sequence*, int, int); private: string dbFileName; diff --git a/chimeraslayer.cpp b/chimeraslayer.cpp index 59b2fea..cd268d3 100644 --- a/chimeraslayer.cpp +++ b/chimeraslayer.cpp @@ -999,7 +999,7 @@ vector ChimeraSlayer::getRefSeqs(Sequence* q, vector& this //find closest seqs to query in template - returns copies of seqs so trim does not destroy - remember to deallocate Sequence* newSeq = new Sequence(q->getName(), q->getAligned()); runFilter(newSeq); - refSeqs = decalc->findClosest(newSeq, thisTemplate, thisFilteredTemplate, numWanted); + refSeqs = decalc->findClosest(newSeq, thisTemplate, thisFilteredTemplate, numWanted, minSim); delete newSeq; }else if (searchMethod == "blast") { refSeqs = getBlastSeqs(q, thisTemplate, numWanted); //fills indexes @@ -1025,12 +1025,12 @@ vector ChimeraSlayer::getBlastSeqs(Sequence* q, vector& db string leftQuery = queryUnAligned.substr(0, int(queryUnAligned.length() * 0.33)); //first 1/3 of the sequence string rightQuery = queryUnAligned.substr(int(queryUnAligned.length() * 0.66)); //last 1/3 of the sequence - Sequence* queryLeft = new Sequence(q->getName()+"left", leftQuery); - Sequence* queryRight = new Sequence(q->getName()+"right", rightQuery); - - vector tempIndexesLeft = databaseLeft->findClosestMegaBlast(queryLeft, num+1); - vector tempIndexesRight = databaseLeft->findClosestMegaBlast(queryRight, num+1); + Sequence* queryLeft = new Sequence(q->getName(), leftQuery); + Sequence* queryRight = new Sequence(q->getName(), rightQuery); + vector tempIndexesLeft = databaseLeft->findClosestMegaBlast(queryLeft, num+1, minSim); + vector tempIndexesRight = databaseLeft->findClosestMegaBlast(queryRight, num+1, minSim); + cout << q->getName() << '\t' << leftQuery << '\t' << "leftMatches = " << tempIndexesLeft.size() << '\t' << rightQuery << " rightMatches = " << tempIndexesRight.size() << endl; vector smaller; vector larger; diff --git a/chimeraslayercommand.cpp b/chimeraslayercommand.cpp index 9299f89..97edf11 100644 --- a/chimeraslayercommand.cpp +++ b/chimeraslayercommand.cpp @@ -23,7 +23,7 @@ vector ChimeraSlayerCommand::setParameters(){ CommandParameter pmismatch("mismatch", "Number", "", "-4.0", "", "", "",false,false); parameters.push_back(pmismatch); CommandParameter pminsim("minsim", "Number", "", "90", "", "", "",false,false); parameters.push_back(pminsim); CommandParameter pmincov("mincov", "Number", "", "70", "", "", "",false,false); parameters.push_back(pmincov); - CommandParameter pminsnp("minsnp", "Number", "", "100", "", "", "",false,false); parameters.push_back(pminsnp); + CommandParameter pminsnp("minsnp", "Number", "", "10", "", "", "",false,false); parameters.push_back(pminsnp); CommandParameter pminbs("minbs", "Number", "", "90", "", "", "",false,false); parameters.push_back(pminbs); CommandParameter psearch("search", "Multiple", "kmer-blast-distance", "distance", "", "", "",false,false); parameters.push_back(psearch); CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors); @@ -31,7 +31,7 @@ vector ChimeraSlayerCommand::setParameters(){ CommandParameter ptrim("trim", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(ptrim); CommandParameter psplit("split", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(psplit); CommandParameter pnumwanted("numwanted", "Number", "", "15", "", "", "",false,false); parameters.push_back(pnumwanted); - CommandParameter piters("iters", "Number", "", "100", "", "", "",false,false); parameters.push_back(piters); + CommandParameter piters("iters", "Number", "", "1000", "", "", "",false,false); parameters.push_back(piters); CommandParameter pdivergence("divergence", "Number", "", "1.007", "", "", "",false,false); parameters.push_back(pdivergence); CommandParameter pparents("parents", "Number", "", "3", "", "", "",false,false); parameters.push_back(pparents); CommandParameter pincrement("increment", "Number", "", "5", "", "", "",false,false); parameters.push_back(pincrement); @@ -72,11 +72,11 @@ string ChimeraSlayerCommand::getHelpString(){ helpString += "The parents parameter allows you to select the number of potential parents to investigate from the numwanted best matches after rating them, default is 3. \n"; helpString += "The mismatch parameter allows you to penalize mismatched bases in blast search, default is -4. \n"; helpString += "The divergence parameter allows you to set a cutoff for chimera determination, default is 1.007. \n"; - helpString += "The iters parameter allows you to specify the number of bootstrap iters to do with the chimeraslayer method, default=100.\n"; + helpString += "The iters parameter allows you to specify the number of bootstrap iters to do with the chimeraslayer method, default=1000.\n"; helpString += "The minsim parameter allows you to specify a minimum similarity with the parent fragments, default=90. \n"; helpString += "The mincov parameter allows you to specify minimum coverage by closest matches found in template. Default is 70, meaning 70%. \n"; helpString += "The minbs parameter allows you to specify minimum bootstrap support for calling a sequence chimeric. Default is 90, meaning 90%. \n"; - helpString += "The minsnp parameter allows you to specify percent of SNPs to sample on each side of breakpoint for computing bootstrap support (default: 100) \n"; + helpString += "The minsnp parameter allows you to specify percent of SNPs to sample on each side of breakpoint for computing bootstrap support (default: 10) \n"; helpString += "The search parameter allows you to specify search method for finding the closest parent. Choices are distance, blast, and kmer, default distance. \n"; helpString += "The realign parameter allows you to realign the query to the potential parents. Choices are true or false, default true. \n"; helpString += "The chimera.slayer command should be in the following format: \n"; @@ -353,7 +353,7 @@ ChimeraSlayerCommand::ChimeraSlayerCommand(string option) { search = validParameter.validFile(parameters, "search", false); if (search == "not found") { search = "distance"; } - temp = validParameter.validFile(parameters, "iters", false); if (temp == "not found") { temp = "100"; } + temp = validParameter.validFile(parameters, "iters", false); if (temp == "not found") { temp = "1000"; } convert(temp, iters); temp = validParameter.validFile(parameters, "increment", false); if (temp == "not found") { temp = "5"; } diff --git a/database.hpp b/database.hpp index f9c0c48..f974ae9 100644 --- a/database.hpp +++ b/database.hpp @@ -50,7 +50,7 @@ public: virtual void addSequence(Sequence) = 0; //add sequence to search engine virtual string getName(int) { return ""; } virtual vector findClosestSequences(Sequence*, int) = 0; // returns indexes of n closest sequences to query - virtual vector findClosestMegaBlast(Sequence*, int){return results;} + virtual vector findClosestMegaBlast(Sequence*, int, int){return results;} virtual float getSearchScore(); virtual vector getSearchScores() { return Scores; } //assumes you already called findClosestMegaBlast virtual int getLongestBase(); diff --git a/decalc.cpp b/decalc.cpp index 4e38053..3b3740b 100644 --- a/decalc.cpp +++ b/decalc.cpp @@ -12,7 +12,7 @@ #include "dist.h" #include "eachgapdist.h" #include "ignoregaps.h" -#include "eachgapdistignorens.h" +#include "eachgapdist.h" //*************************************************************************************************************** void DeCalculator::setMask(string ms) { @@ -683,7 +683,7 @@ float DeCalculator::getCoef(vector obs, vector qav) { } //*************************************************************************************************************** //gets closest matches to each end, since chimeras will most likely have different parents on each end -vector DeCalculator::findClosest(Sequence* querySeq, vector& thisTemplate, vector& thisFilteredTemplate, int numWanted) { +vector DeCalculator::findClosest(Sequence* querySeq, vector& thisTemplate, vector& thisFilteredTemplate, int numWanted, int minSim) { try { //indexes.clear(); @@ -692,7 +692,7 @@ vector DeCalculator::findClosest(Sequence* querySeq, vector distsLeft; vector distsRight; - Dist* distcalculator = new eachGapDistIgnoreNs(); + Dist* distcalculator = new eachGapDist(); string queryUnAligned = querySeq->getUnaligned(); int numBases = int(queryUnAligned.length() * 0.33); @@ -813,14 +813,32 @@ vector DeCalculator::findClosest(Sequence* querySeq, vector= minSim) || (lastRight >= minSim)) { + //add in ties from left + int i = numWanted; + while (i < distsLeft.size()) { + if (distsLeft[i].dist == lastLeft) { dists.push_back(distsLeft[i]); } + else { break; } + i++; + } + + //add in ties from right + i = numWanted; + while (i < distsRight.size()) { + if (distsRight[i].dist == lastRight) { dists.push_back(distsRight[i]); } + else { break; } + i++; + } + } + - //numWanted = dists.size(); //cout << numWanted << endl; for (int i = 0; i < dists.size(); i++) { // cout << db[dists[i].index]->getName() << '\t' << dists[i].dist << endl; - if (thisTemplate[dists[i].index]->getName() != querySeq->getName()) { + if ((thisTemplate[dists[i].index]->getName() != querySeq->getName()) && (dists[i].dist >= minSim)) { Sequence* temp = new Sequence(thisTemplate[dists[i].index]->getName(), thisTemplate[dists[i].index]->getAligned()); //have to make a copy so you can trim and filter without stepping on eachother. seqsMatches.push_back(temp); @@ -841,7 +859,7 @@ Sequence* DeCalculator::findClosest(Sequence* querySeq, vector db) { Sequence* seqsMatch; - Dist* distcalculator = new eachGapDistIgnoreNs(); + Dist* distcalculator = new eachGapDist(); int index = 0; int smallest = 1000000; diff --git a/decalc.h b/decalc.h index 42e1441..40b3a10 100644 --- a/decalc.h +++ b/decalc.h @@ -39,7 +39,7 @@ class DeCalculator { DeCalculator() { m = MothurOut::getInstance(); } ~DeCalculator() {}; - vector findClosest(Sequence*, vector&, vector&, int); //takes querySeq, a reference db, filteredRefDB, numWanted + vector findClosest(Sequence*, vector&, vector&, int, int); //takes querySeq, a reference db, filteredRefDB, numWanted, minSim Sequence* findClosest(Sequence*, vector); set getPos() { return h; } void setMask(string); diff --git a/maligner.cpp b/maligner.cpp index 52fd36f..f834db1 100644 --- a/maligner.cpp +++ b/maligner.cpp @@ -151,6 +151,7 @@ string Maligner::chimeraMaligner(int chimeraPenalty, DeCalculator* decalc) { temp.queryToParentLocal = computePercentID(queryInRegion, parentInRegion); // cout << temp.parent << '\t' << "NAST:" << temp.nastRegionStart << '-' << temp.nastRegionEnd << " G:" << temp.queryToParent << " L:" << temp.queryToParentLocal << endl; + outputResults.push_back(temp); } diff --git a/slayer.cpp b/slayer.cpp index 5ecb4ea..a244ea4 100644 --- a/slayer.cpp +++ b/slayer.cpp @@ -426,21 +426,41 @@ float Slayer::snpAB(vector data) { } } /***********************************************************************/ -float Slayer::computePercentID(string queryFrag, string parent, int left, int right) { +float Slayer::computePercentID(string queryAlign, string chimera, int left, int right) { try { - int total = 0; - int matches = 0; - + + int numIdentical = 0; + int countA = 0; + int countB = 0; for (int i = left; i <= right; i++) { - total++; - if (queryFrag[i] == parent[i]) { - matches++; + if (((queryAlign[i] != 'G') && (queryAlign[i] != 'T') && (queryAlign[i] != 'A') && (queryAlign[i] != 'C')&& (queryAlign[i] != '.') && (queryAlign[i] != '-')) || + ((chimera[i] != 'G') && (chimera[i] != 'T') && (chimera[i] != 'A') && (chimera[i] != 'C')&& (chimera[i] != '.') && (chimera[i] != '-'))) {} + else { + + bool charA = false; bool charB = false; + if ((queryAlign[i] == 'G') || (queryAlign[i] == 'T') || (queryAlign[i] == 'A') || (queryAlign[i] == 'C')) { charA = true; } + if ((chimera[i] == 'G') || (chimera[i] == 'T') || (chimera[i] == 'A') || (chimera[i] == 'C')) { charB = true; } + + if (charA || charB) { + + if (charA) { countA++; } + if (charB) { countB++; } + + if (queryAlign[i] == chimera[i]) { + numIdentical++; + } + } } } - - float percentID =( matches/(float)total) * 100; - return percentID; + float numBases = (countA + countB) /(float) 2; + + if (numBases == 0) { return 0; } + + float percentIdentical = (numIdentical/(float)numBases) * 100; + + return percentIdentical; + } catch(exception& e) { m->errorOut(e, "Slayer", "computePercentID"); -- 2.39.2