]> git.donarmstrong.com Git - mothur.git/commitdiff
chimeras
authorwestcott <westcott>
Wed, 17 Feb 2010 13:30:52 +0000 (13:30 +0000)
committerwestcott <westcott>
Wed, 17 Feb 2010 13:30:52 +0000 (13:30 +0000)
blastdb.cpp
chimera.h
chimerarealigner.cpp
chimeraseqscommand.cpp
chimeraseqscommand.h
chimeraslayer.cpp
chimeraslayer.h
decalc.cpp

index b139f3eb7a47bcc86f0920bdc0fc22bad368816a..dff1074e92a3d405419712c72c3528333a004814 100644 (file)
@@ -72,6 +72,23 @@ vector<int> BlastDB::findClosestSequences(Sequence* seq, int n) {
                }
                m8FileHandle.close();
                
+               string root = dbFileName;
+               string temp = dbFileName + ".nsq";
+               remove(temp.c_str());   
+               temp = dbFileName + ".nsi";
+               remove(temp.c_str());
+               
+               temp = dbFileName + ".nsd";
+               remove(temp.c_str());   
+
+               temp = dbFileName + ".nin";
+               remove(temp.c_str());   
+
+               temp = dbFileName + ".nhr";
+               remove(temp.c_str());   
+       
+
+               
                return topMatches;
        }
        catch(exception& e) {
index 4d590123ee319768f694d8b8f3fd213ad055290e..41615733392988ceacec5531b908349157804ce5 100644 (file)
--- a/chimera.h
+++ b/chimera.h
@@ -88,6 +88,7 @@ class Chimera {
        
                Chimera(){};
                Chimera(string);
+               Chimera(string, bool);
                Chimera(string, string);
                virtual ~Chimera(){};
                virtual void setFilter(bool f)                  {       filter = f;                     }
index f45b9a87f6829b68c81c8b137758ec3f6a096022..fa9dd566c83014124fa302be6d5505102acdddac 100644 (file)
@@ -24,7 +24,7 @@ void ChimeraReAligner::reAlign(Sequence* query, vector<results> parents) {
                        
                        string qAligned = query->getAligned();
                        string newQuery = "";
-                       
+               
                        //sort parents by region start
                        sort(parents.begin(), parents.end(), compareRegionStart);
 
@@ -33,19 +33,16 @@ void ChimeraReAligner::reAlign(Sequence* query, vector<results> parents) {
                        int longest = 0;
                        //take query and break apart into pieces using breakpoints given by results of parents
                        for (int i = 0; i < parents.size(); i++) {
-       cout << parents[i].parent << '\t' << parents[i].nastRegionStart <<  '\t' << parents[i].nastRegionEnd    << endl;        
-                               int length = parents[i].nastRegionEnd - parents[i].nastRegionStart;
+                               int length = parents[i].nastRegionEnd - parents[i].nastRegionStart+1;
                                string q = qAligned.substr(parents[i].nastRegionStart, length);
-       cout << "query = " << q << endl;
                                Sequence* queryFrag = new Sequence(query->getName(), q);
                                
                                queryParts.push_back(queryFrag);
-                               
+                       
                                Sequence* parent = getSequence(parents[i].parent);
                                string p = parent->getAligned();
                
                                p = p.substr(parents[i].nastRegionStart, length);
-       cout << "parent = " << p << endl;               
                                parent->setAligned(p);
                                
                                parentParts.push_back(parent);
@@ -69,7 +66,7 @@ void ChimeraReAligner::reAlign(Sequence* query, vector<results> parents) {
                        
                        //make sure you don't cutoff end of query 
                        if (parents[parents.size()-1].nastRegionEnd < qAligned.length()) {  newQuery += qAligned.substr(parents[parents.size()-1].nastRegionEnd);  }
-                       
+               
                        //set query to new aligned string
                        query->setAligned(newQuery);
                        
index e1eebf400c052f7f7926e0029f6090ae0e93ede3..13adb52bbbbb2e18477964f7fda344fd1dc1bcf2 100644 (file)
@@ -27,7 +27,7 @@ ChimeraSeqsCommand::ChimeraSeqsCommand(string option){
                else {
                        //valid paramters for this command
                        string Array[] =  {"fasta", "filter", "correction", "processors", "method", "window", "increment", "template", "conservation", "quantile", "mask", 
-                       "numwanted", "ksize", "svg", "name", "match","mismatch", "divergence", "minsim","mincov","minbs", "minsnp","parents", "iters","outputdir","inputdir" };
+                       "numwanted", "ksize", "svg", "name", "match","mismatch", "divergence", "minsim","mincov","minbs", "minsnp","parents", "iters","outputdir","inputdir", "search","realign" };
                        vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
                        
                        OptionParser parser(option);
@@ -177,6 +177,11 @@ ChimeraSeqsCommand::ChimeraSeqsCommand(string option){
                        temp = validParameter.validFile(parameters, "parents", false);                  if (temp == "not found") { temp = "3"; }
                        convert(temp, parents); 
                        
+                       temp = validParameter.validFile(parameters, "realign", false);                  if (temp == "not found") { temp = "f"; }
+                       realign = isTrue(temp); 
+                       
+                       search = validParameter.validFile(parameters, "search", false);                 if (search == "not found") { temp = "distance"; }
+                       
                        temp = validParameter.validFile(parameters, "iters", false);    
                        if ((temp == "not found") && (method == "chimeraslayer")) { temp = "100"; }             
                        else if (temp == "not found") { temp = "1000"; }
@@ -213,7 +218,7 @@ void ChimeraSeqsCommand::help(){
                //"fasta", "filter", "correction", "processors", "method", "window", "increment", "template", "conservation", "quantile", "mask", "numwanted", "ksize", "svg", "name"
                //mothurOut("chimera.seqs ASSUMES that your sequences are ALIGNED and if using a template that the template file sequences are the same length as the fasta file sequences.\n\n");
                mothurOut("The chimera.seqs command reads a fastafile and creates list of potentially chimeric sequences.\n");
-               mothurOut("The chimera.seqs command parameters are fasta, filter, correction, processors, mask, method, window, increment, template, conservation, quantile, numwanted, ksize, svg, name, iters.\n");
+               mothurOut("The chimera.seqs command parameters are fasta, filter, correction, processors, mask, method, window, increment, template, conservation, quantile, numwanted, ksize, svg, name, iters, search, realign.\n");
                mothurOut("The fasta parameter is always required and template is required if using pintail, ccode or chimeracheck.\n");
                mothurOut("The filter parameter allows you to specify if you would like to apply a vertical and 50% soft filter. \n");
                mothurOut("The correction parameter allows you to put more emphasis on the distance between highly similar sequences and less emphasis on the differences between remote homologs.\n");
@@ -234,6 +239,8 @@ void ChimeraSeqsCommand::help(){
                mothurOut("The mincov parameter allows you to specify minimum coverage by closest matches found in template. Default is 70, meaning 70%. \n");
                mothurOut("The minbs parameter allows you to specify minimum bootstrap support for calling a sequence chimeric. Default is 90, meaning 90%. \n");
                mothurOut("The minsnp parameter allows you to specify percent of SNPs to sample on each side of breakpoint for computing bootstrap support (default: 10) \n");
+               mothurOut("The search parameter allows you to specify search method for finding the closest parent. Choices are distance and blast, default distance.  -used only by chimeraslayer. \n");
+               mothurOut("The realign parameter allows you to realign the query to the potential paretns. Choices are true or false, default false.  -used only by chimeraslayer. \n");
                mothurOut("NOT ALL PARAMETERS ARE USED BY ALL METHODS. Please look below for method specifics.\n\n");
                mothurOut("Details for each method: \n"); 
                mothurOut("\tpintail: \n"); 
@@ -275,7 +282,7 @@ int ChimeraSeqsCommand::execute(){
                else if (method == "pintail")                   {               chimera = new Pintail(fastafile, outputDir);                            }
                else if (method == "ccode")                             {               chimera = new Ccode(fastafile, outputDir);                                      }
                else if (method == "chimeracheck")              {               chimera = new ChimeraCheckRDP(fastafile, outputDir);            }
-               else if (method == "chimeraslayer")             {               chimera = new ChimeraSlayer("blast");                                           }
+               else if (method == "chimeraslayer")             {               chimera = new ChimeraSlayer(search, realign);                           }
                else { mothurOut("Not a valid method."); mothurOutEndLine(); return 0;          }
                
                //set user options
index bac02852744952477c97c0ea7dc284f56ff589bc..3e437ec33c5382b57b2364570a4a4caa094c97ea 100644 (file)
@@ -40,8 +40,8 @@ private:
        void appendOutputFiles(string, string); 
 
        bool abort;
-       string method, fastafile, templatefile, consfile, quanfile, maskfile, namefile, outputDir;
-       bool filter, correction, svg, printAll;
+       string method, fastafile, templatefile, consfile, quanfile, maskfile, namefile, outputDir, search;
+       bool filter, correction, svg, printAll, realign;
        int processors, midpoint, averageLeft, averageRight, window, iters, increment, numwanted, ksize, match, mismatch, parents, minSimilarity, minCoverage, minBS, minSNP, numSeqs;
        float divR;
        Chimera* chimera;
index 2cbd5d8343f8b96614020d2be5f04250089b73ee..8a0cbd067cc6cbfb8ced9fbb3a3cb89ea6fdc62b 100644 (file)
@@ -11,7 +11,7 @@
 #include "chimerarealigner.h"
 
 //***************************************************************************************************************
-ChimeraSlayer::ChimeraSlayer(string mode) : searchMethod(mode) {       decalc = new DeCalculator();      }
+ChimeraSlayer::ChimeraSlayer(string mode, bool r) : searchMethod(mode), realign(r) {   decalc = new DeCalculator();      }
 //***************************************************************************************************************
 ChimeraSlayer::~ChimeraSlayer() {      delete decalc;   }
 //***************************************************************************************************************
@@ -63,10 +63,12 @@ int ChimeraSlayer::getChimeras(Sequence* query) {
                string chimeraFlag = maligner->getResults(query, decalc);
                vector<results> Results = maligner->getOutput();
 
-               //realign query to parents to improve slayers detection rate
-               ChimeraReAligner realigner(templateSeqs, match, misMatch);
-               realigner.reAlign(query, Results);
-cout << query->getName() << '\n' << query->getAligned() << endl;
+               //realign query to parents to improve slayers detection rate???
+               if (realign) {
+                       ChimeraReAligner realigner(templateSeqs, match, misMatch);
+                       realigner.reAlign(query, Results);
+               }
+
                        //if (chimeraFlag == "yes") {
                        
                //get sequence that were given from maligner results
index 4095b86a4b0855e31b05bbfd775504801cc4ed1f..f8006c41382919b311e8fd8b590f31e9b8bfaad1 100644 (file)
@@ -23,7 +23,7 @@
 class ChimeraSlayer : public Chimera {
        
        public:
-               ChimeraSlayer(string);  
+               ChimeraSlayer(string, bool);    
                ~ChimeraSlayer();
                
                int getChimeras(Sequence*);
@@ -40,6 +40,7 @@ class ChimeraSlayer : public Chimera {
                vector<data_struct>  chimeraResults;
                string chimeraFlags, searchMethod;
                string fastafile;
+               bool realign;
        
                void printBlock(data_struct, ostream&);
 };
index 911efe268b6cd71817c28c7a8c008f850ff63e3e..61de7c832dfbc7a16f8984b6bff127ebc68ba63f 100644 (file)
@@ -688,22 +688,48 @@ vector<Sequence*> DeCalculator::findClosest(Sequence* querySeq, vector<Sequence*
                
                Dist* distcalculator = new eachGapDist();
                
+               string queryUnAligned = querySeq->getUnaligned();
+               int numBases = int(queryUnAligned.length() * 0.33);
+               
+               string leftQuery = ""; //first 1/3 of the sequence
+               string rightQuery = ""; //last 1/3 of the sequence
                string queryAligned = querySeq->getAligned();
-               string leftQuery = queryAligned.substr(0, (queryAligned.length() / 3)); //first 1/3 of the sequence
-               string rightQuery = queryAligned.substr(((queryAligned.length() / 3)*2)); //last 1/3 of the sequence
+               
+               //left side
+               int baseCount = 0;
+               int leftSpot = 0;
+               for (int i = 0; i < queryAligned.length(); i++) {
+                       leftQuery += queryAligned[i];
+                       //if you are a base
+                       if ((queryAligned[i] != '.') && (queryAligned[i] != '-')) {             baseCount++;    }
+                       //if you have 1/3
+                       if (baseCount >= numBases) {  leftSpot = i; break; } //first 1/3
+               }
+               
+               //right side - count through another 1/3, so you are at last third
+               baseCount = 0;
+               int rightSpot = 0;
+               for (int i = leftSpot; i < queryAligned.length(); i++) {
+                       //if you are a base
+                       if ((queryAligned[i] != '.') && (queryAligned[i] != '-')) {             baseCount++;    }
+                       //if you have 1/3
+                       if (baseCount >= numBases) { rightSpot = i;  break; } //last 1/3
+               }
+               rightQuery = queryAligned.substr(rightSpot); //sequence from pos spot to end
                
                Sequence queryLeft(querySeq->getName(), leftQuery);
                Sequence queryRight(querySeq->getName(), rightQuery);
-                               
+//cout << querySeq->getName() << '\t' << leftSpot << '\t' << rightSpot << endl;
+//cout << queryUnAligned.length() << '\t' << queryLeft.getUnaligned().length() << '\t' << queryRight.getUnaligned().length() << endl;
                for(int j = 0; j < db.size(); j++){
                        
                        string dbAligned = db[j]->getAligned();
-                       string leftDB = dbAligned.substr(0, (dbAligned.length() / 3)); //first 1/3 of the sequence
-                       string rightDB = dbAligned.substr(((dbAligned.length() / 3)*2)); //last 1/3 of the sequence
+                       string leftDB = dbAligned.substr(0, leftSpot+1); //first 1/3 of the sequence
+                       string rightDB = dbAligned.substr(rightSpot); //last 1/3 of the sequence
                        
                        Sequence dbLeft(db[j]->getName(), leftDB);
                        Sequence dbRight(db[j]->getName(), rightDB);
-                       
+
                        distcalculator->calcDist(queryLeft, dbLeft);
                        float distLeft = distcalculator->getDist();