]> git.donarmstrong.com Git - mothur.git/commitdiff
changes for chimera slayer
authorwestcott <westcott>
Wed, 4 May 2011 14:11:02 +0000 (14:11 +0000)
committerwestcott <westcott>
Wed, 4 May 2011 14:11:02 +0000 (14:11 +0000)
blastdb.cpp
blastdb.hpp
chimeraslayer.cpp
chimeraslayercommand.cpp
database.hpp
decalc.cpp
decalc.h
maligner.cpp
slayer.cpp

index fdb5456050bcce78b620e9d3d697a334626a54ae..3e1210b46abaa4c01314a8702feea3eded345c11 100644 (file)
@@ -111,9 +111,10 @@ vector<int> BlastDB::findClosestSequences(Sequence* seq, int n) {
 }
 /**************************************************************************************************/
 //assumes you have added all the template sequences using the addSequence function and run generateDB.
-vector<int> BlastDB::findClosestMegaBlast(Sequence* seq, int n) {
+vector<int> BlastDB::findClosestMegaBlast(Sequence* seq, int n, int minPerID) {
        try{
                vector<int> topMatches;
+               float numBases, mismatch, gap, startQuery, endQuery, startRef, endRef, score;
                Scores.clear();
                
                ofstream queryFile;
@@ -126,7 +127,7 @@ vector<int> BlastDB::findClosestMegaBlast(Sequence* seq, int n) {
                //      the goal here is to quickly survey the database to find the closest match.  To do this we are using the default
                //      wordsize used in megablast.  I'm sure we're sacrificing accuracy for speed, but anyother way would take way too
                //      long.  With this setting, it seems comparable in speed to the suffix tree approach.
-       
+//7000004128189528left 0       100             66      0       0       1       66      61      126     1e-31    131    
                string blastCommand = path + "blast/bin/megablast -e 1e-10 -d " + dbFileName + " -m 8 -b " + toString(n) + " -v " + toString(n); //-W 28 -p blastn
                blastCommand += (" -i " + (queryFileName+seq->getName()) + " -o " + blastFileName+seq->getName());
                system(blastCommand.c_str());
@@ -134,20 +135,22 @@ vector<int> BlastDB::findClosestMegaBlast(Sequence* seq, int n) {
                ifstream m8FileHandle;
                m->openInputFile(blastFileName+seq->getName(), m8FileHandle, "no error");
        
-               string dummy;
+               string dummy, eScore;
                int templateAccession;
                m->gobble(m8FileHandle);
                
                while(!m8FileHandle.eof()){
-                       m8FileHandle >> dummy >> templateAccession >> searchScore;
-                       //cout << templateAccession << '\t' << searchScore << endl;
+                       m8FileHandle >> dummy >> templateAccession >> searchScore >> numBases >> mismatch >> gap >> startQuery >> endQuery >> startRef >> endRef >> eScore >> score;
+                       //cout << dummy << '\t' << templateAccession << '\t' << searchScore << '\t';
                        
                        //get rest of junk in line
-                       while (!m8FileHandle.eof())     {       char c = m8FileHandle.get(); if (c == 10 || c == 13){   break;  }       } 
-                       
+                       //while (!m8FileHandle.eof())   {       char c = m8FileHandle.get(); if (c == 10 || c == 13){   break;  }else{ cout << c; }     } //
+                               //cout << endl;
                        m->gobble(m8FileHandle);
-                       topMatches.push_back(templateAccession);
-                       Scores.push_back(searchScore);
+                       if (score >= minPerID) {
+                               topMatches.push_back(templateAccession);
+                               Scores.push_back(searchScore);
+                       }
 //cout << templateAccession << endl;
                }
                m8FileHandle.close();
index 844f7f1fdc78abe8e3a1e38ca6045e1584a454db..9b8965d017b69a8f8cf27884261ea1095f77e176 100644 (file)
@@ -23,7 +23,7 @@ public:
        void generateDB();
        void addSequence(Sequence);
        vector<int> findClosestSequences(Sequence*, int);
-       vector<int> findClosestMegaBlast(Sequence*, int);
+       vector<int> findClosestMegaBlast(Sequence*, int, int);
        
 private:
        string dbFileName;
index 59b2fea4fa895a10ffe6ff9f462c448c7fa88014..cd268d3d6f1a513e932940332bab8ed15c59ada1 100644 (file)
@@ -999,7 +999,7 @@ vector<Sequence*> ChimeraSlayer::getRefSeqs(Sequence* q, vector<Sequence*>& this
                        //find closest seqs to query in template - returns copies of seqs so trim does not destroy - remember to deallocate
                        Sequence* newSeq = new Sequence(q->getName(), q->getAligned());
                        runFilter(newSeq);
-                       refSeqs = decalc->findClosest(newSeq, thisTemplate, thisFilteredTemplate, numWanted);
+                       refSeqs = decalc->findClosest(newSeq, thisTemplate, thisFilteredTemplate, numWanted, minSim);
                        delete newSeq;
                }else if (searchMethod == "blast")  {
                        refSeqs = getBlastSeqs(q, thisTemplate, numWanted); //fills indexes
@@ -1025,12 +1025,12 @@ vector<Sequence*> ChimeraSlayer::getBlastSeqs(Sequence* q, vector<Sequence*>& db
                string leftQuery = queryUnAligned.substr(0, int(queryUnAligned.length() * 0.33)); //first 1/3 of the sequence
                string rightQuery = queryUnAligned.substr(int(queryUnAligned.length() * 0.66)); //last 1/3 of the sequence
                
-               Sequence* queryLeft = new Sequence(q->getName()+"left", leftQuery);
-               Sequence* queryRight = new Sequence(q->getName()+"right", rightQuery);
-               
-               vector<int> tempIndexesLeft = databaseLeft->findClosestMegaBlast(queryLeft, num+1);
-               vector<int> tempIndexesRight = databaseLeft->findClosestMegaBlast(queryRight, num+1);
+               Sequence* queryLeft = new Sequence(q->getName(), leftQuery);
+               Sequence* queryRight = new Sequence(q->getName(), rightQuery);
                
+               vector<int> tempIndexesLeft = databaseLeft->findClosestMegaBlast(queryLeft, num+1, minSim);
+               vector<int> tempIndexesRight = databaseLeft->findClosestMegaBlast(queryRight, num+1, minSim);
+               cout << q->getName() << '\t' << leftQuery << '\t' << "leftMatches = " << tempIndexesLeft.size() << '\t' << rightQuery   << " rightMatches = " << tempIndexesRight.size() << endl;
                vector<int> smaller;
                vector<int> larger;
                
index 9299f897657c993e92707216f03ec3e8590ec7f2..97edf1130be292c1495811a4a9c52acc77cde04b 100644 (file)
@@ -23,7 +23,7 @@ vector<string> ChimeraSlayerCommand::setParameters(){
                CommandParameter pmismatch("mismatch", "Number", "", "-4.0", "", "", "",false,false); parameters.push_back(pmismatch);
                CommandParameter pminsim("minsim", "Number", "", "90", "", "", "",false,false); parameters.push_back(pminsim);
                CommandParameter pmincov("mincov", "Number", "", "70", "", "", "",false,false); parameters.push_back(pmincov);
-               CommandParameter pminsnp("minsnp", "Number", "", "100", "", "", "",false,false); parameters.push_back(pminsnp);
+               CommandParameter pminsnp("minsnp", "Number", "", "10", "", "", "",false,false); parameters.push_back(pminsnp);
                CommandParameter pminbs("minbs", "Number", "", "90", "", "", "",false,false); parameters.push_back(pminbs);
                CommandParameter psearch("search", "Multiple", "kmer-blast-distance", "distance", "", "", "",false,false); parameters.push_back(psearch);
                CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
@@ -31,7 +31,7 @@ vector<string> ChimeraSlayerCommand::setParameters(){
                CommandParameter ptrim("trim", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(ptrim);
                CommandParameter psplit("split", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(psplit);
                CommandParameter pnumwanted("numwanted", "Number", "", "15", "", "", "",false,false); parameters.push_back(pnumwanted);
-               CommandParameter piters("iters", "Number", "", "100", "", "", "",false,false); parameters.push_back(piters);
+               CommandParameter piters("iters", "Number", "", "1000", "", "", "",false,false); parameters.push_back(piters);
                CommandParameter pdivergence("divergence", "Number", "", "1.007", "", "", "",false,false); parameters.push_back(pdivergence);
                CommandParameter pparents("parents", "Number", "", "3", "", "", "",false,false); parameters.push_back(pparents);
                CommandParameter pincrement("increment", "Number", "", "5", "", "", "",false,false); parameters.push_back(pincrement);
@@ -72,11 +72,11 @@ string ChimeraSlayerCommand::getHelpString(){
                helpString += "The parents parameter allows you to select the number of potential parents to investigate from the numwanted best matches after rating them, default is 3. \n";
                helpString += "The mismatch parameter allows you to penalize mismatched bases in blast search, default is -4. \n";
                helpString += "The divergence parameter allows you to set a cutoff for chimera determination, default is 1.007. \n";
-               helpString += "The iters parameter allows you to specify the number of bootstrap iters to do with the chimeraslayer method, default=100.\n";
+               helpString += "The iters parameter allows you to specify the number of bootstrap iters to do with the chimeraslayer method, default=1000.\n";
                helpString += "The minsim parameter allows you to specify a minimum similarity with the parent fragments, default=90. \n";
                helpString += "The mincov parameter allows you to specify minimum coverage by closest matches found in template. Default is 70, meaning 70%. \n";
                helpString += "The minbs parameter allows you to specify minimum bootstrap support for calling a sequence chimeric. Default is 90, meaning 90%. \n";
-               helpString += "The minsnp parameter allows you to specify percent of SNPs to sample on each side of breakpoint for computing bootstrap support (default: 100) \n";
+               helpString += "The minsnp parameter allows you to specify percent of SNPs to sample on each side of breakpoint for computing bootstrap support (default: 10) \n";
                helpString += "The search parameter allows you to specify search method for finding the closest parent. Choices are distance, blast, and kmer, default distance. \n";
                helpString += "The realign parameter allows you to realign the query to the potential parents. Choices are true or false, default true.  \n";
                helpString += "The chimera.slayer command should be in the following format: \n";
@@ -353,7 +353,7 @@ ChimeraSlayerCommand::ChimeraSlayerCommand(string option)  {
                        
                        search = validParameter.validFile(parameters, "search", false);                 if (search == "not found") { search = "distance"; }
                        
-                       temp = validParameter.validFile(parameters, "iters", false);                    if (temp == "not found") { temp = "100"; }              
+                       temp = validParameter.validFile(parameters, "iters", false);                    if (temp == "not found") { temp = "1000"; }             
                        convert(temp, iters); 
                         
                        temp = validParameter.validFile(parameters, "increment", false);                if (temp == "not found") { temp = "5"; }
index f9c0c4878c31a67e2df415387f5ee5cf3bbda939..f974ae966ed5f85c1136ed59a0c82b79f2f847d2 100644 (file)
@@ -50,7 +50,7 @@ public:
        virtual void addSequence(Sequence) = 0;  //add sequence to search engine
        virtual string getName(int) { return ""; }  
        virtual vector<int> findClosestSequences(Sequence*, int) = 0;  // returns indexes of n closest sequences to query
-       virtual vector<int> findClosestMegaBlast(Sequence*, int){return results;}
+       virtual vector<int> findClosestMegaBlast(Sequence*, int, int){return results;}
        virtual float getSearchScore();
        virtual vector<float> getSearchScores() { return Scores; } //assumes you already called findClosestMegaBlast
        virtual int getLongestBase(); 
index 4e38053beff596047cb56a7a50cf283b11d92f99..3b3740bcea6682e14b21300a48e07fdb9f7079a3 100644 (file)
@@ -12,7 +12,7 @@
 #include "dist.h"
 #include "eachgapdist.h"
 #include "ignoregaps.h"
-#include "eachgapdistignorens.h"
+#include "eachgapdist.h"
 
 //***************************************************************************************************************
 void DeCalculator::setMask(string ms) { 
@@ -683,7 +683,7 @@ float DeCalculator::getCoef(vector<float> obs, vector<float> qav) {
 }
 //***************************************************************************************************************
 //gets closest matches to each end, since chimeras will most likely have different parents on each end
-vector<Sequence*> DeCalculator::findClosest(Sequence* querySeq, vector<Sequence*>& thisTemplate, vector<Sequence*>& thisFilteredTemplate, int numWanted) {
+vector<Sequence*> DeCalculator::findClosest(Sequence* querySeq, vector<Sequence*>& thisTemplate, vector<Sequence*>& thisFilteredTemplate, int numWanted, int minSim) {
        try {
                //indexes.clear();
                
@@ -692,7 +692,7 @@ vector<Sequence*> DeCalculator::findClosest(Sequence* querySeq, vector<Sequence*
                vector<SeqDist> distsLeft;
                vector<SeqDist> distsRight;
                
-               Dist* distcalculator = new eachGapDistIgnoreNs();
+               Dist* distcalculator = new eachGapDist();
                
                string queryUnAligned = querySeq->getUnaligned();
                int numBases = int(queryUnAligned.length() * 0.33);
@@ -813,14 +813,32 @@ vector<Sequence*> DeCalculator::findClosest(Sequence* querySeq, vector<Sequence*
                        
                }
                
+               //are we still above the minimum similarity cutoff
+               if ((lastLeft >= minSim) || (lastRight >= minSim)) {
+                       //add in ties from left
+                       int i = numWanted;
+                       while (i < distsLeft.size()) {  
+                               if (distsLeft[i].dist == lastLeft) {  dists.push_back(distsLeft[i]);  }
+                               else { break; }
+                               i++;
+                       }
+                       
+                       //add in ties from right
+                       i = numWanted;
+                       while (i < distsRight.size()) {  
+                               if (distsRight[i].dist == lastRight) {  dists.push_back(distsRight[i]);  }
+                               else { break; }
+                               i++;
+                       }
+               }
+               
 
-               //numWanted = dists.size();
 
                //cout << numWanted << endl;
                for (int i = 0; i < dists.size(); i++) {
 //                     cout << db[dists[i].index]->getName() << '\t' << dists[i].dist << endl;
 
-                       if (thisTemplate[dists[i].index]->getName() != querySeq->getName()) {
+                       if ((thisTemplate[dists[i].index]->getName() != querySeq->getName()) && (dists[i].dist >= minSim)) {
                                Sequence* temp = new Sequence(thisTemplate[dists[i].index]->getName(), thisTemplate[dists[i].index]->getAligned()); //have to make a copy so you can trim and filter without stepping on eachother.
                        
                                seqsMatches.push_back(temp);
@@ -841,7 +859,7 @@ Sequence* DeCalculator::findClosest(Sequence* querySeq, vector<Sequence*> db) {
                
                Sequence* seqsMatch;  
                
-               Dist* distcalculator = new eachGapDistIgnoreNs();
+               Dist* distcalculator = new eachGapDist();
                int index = 0;
                int smallest = 1000000;
                
index 42e14416004efbea78edb36e19bf6a799d3ab523..40b3a10cb601d67bb17c442a9a972f42bbcfa104 100644 (file)
--- a/decalc.h
+++ b/decalc.h
@@ -39,7 +39,7 @@ class DeCalculator {
                DeCalculator() { m = MothurOut::getInstance(); }
                ~DeCalculator() {};
                
-               vector<Sequence*> findClosest(Sequence*, vector<Sequence*>&, vector<Sequence*>&, int);  //takes querySeq, a reference db, filteredRefDB, numWanted 
+               vector<Sequence*> findClosest(Sequence*, vector<Sequence*>&, vector<Sequence*>&, int, int);  //takes querySeq, a reference db, filteredRefDB, numWanted, minSim 
                Sequence* findClosest(Sequence*, vector<Sequence*>);
                set<int> getPos() {  return h;  }
                void setMask(string); 
index 52fd36f53f64af53858ff8002063fd77950ea14d..f834db1db319badab0ddb70cde9e54c7dbb3193f 100644 (file)
@@ -151,6 +151,7 @@ string Maligner::chimeraMaligner(int chimeraPenalty, DeCalculator* decalc) {
                        temp.queryToParentLocal = computePercentID(queryInRegion, parentInRegion);
                        
 //                     cout << temp.parent << '\t' << "NAST:" << temp.nastRegionStart << '-' << temp.nastRegionEnd << " G:" << temp.queryToParent << " L:" << temp.queryToParentLocal << endl;
+
                        outputResults.push_back(temp);
                }
                
index 5ecb4ea7c5c430b34efc934b1e736915f363438c..a244ea42634fc9b31218aceaafd3d76f249d92e8 100644 (file)
@@ -426,21 +426,41 @@ float Slayer::snpAB(vector<snps> data) {
        }
 }
 /***********************************************************************/
-float Slayer::computePercentID(string queryFrag, string parent, int left, int right) {
+float Slayer::computePercentID(string queryAlign, string chimera, int left, int right) {
        try {
-               int total = 0;
-               int matches = 0;
-
+                               
+               int numIdentical = 0;
+               int countA = 0;
+               int countB = 0;
                for (int i = left; i <= right; i++) {
-                       total++;
-                       if (queryFrag[i] == parent[i]) {
-                               matches++;
+                       if (((queryAlign[i] != 'G') && (queryAlign[i] != 'T') && (queryAlign[i] != 'A') && (queryAlign[i] != 'C')&& (queryAlign[i] != '.') && (queryAlign[i] != '-')) ||
+                               ((chimera[i] != 'G') && (chimera[i] != 'T') && (chimera[i] != 'A') && (chimera[i] != 'C')&& (chimera[i] != '.') && (chimera[i] != '-'))) {}
+                       else {
+                               
+                               bool charA = false; bool charB = false;
+                               if ((queryAlign[i] == 'G') || (queryAlign[i] == 'T') || (queryAlign[i] == 'A') || (queryAlign[i] == 'C')) { charA = true; }
+                               if ((chimera[i] == 'G') || (chimera[i] == 'T') || (chimera[i] == 'A') || (chimera[i] == 'C')) { charB = true; }
+                               
+                               if (charA || charB) {
+                                       
+                                       if (charA) { countA++; }
+                                       if (charB) { countB++; }
+                                       
+                                       if (queryAlign[i] == chimera[i]) {
+                                               numIdentical++;
+                                       }
+                               }
                        }
                }
-
-               float percentID =( matches/(float)total) * 100;
                
-               return percentID;
+               float numBases = (countA + countB) /(float) 2;
+               
+               if (numBases == 0) { return 0; }
+               
+               float percentIdentical = (numIdentical/(float)numBases) * 100;
+               
+               return percentIdentical;
+               
        }
        catch(exception& e) {
                m->errorOut(e, "Slayer", "computePercentID");