From: westcott Date: Wed, 17 Feb 2010 13:30:52 +0000 (+0000) Subject: chimeras X-Git-Url: https://git.donarmstrong.com/?p=mothur.git;a=commitdiff_plain;h=e72551c9cc5542e6a354f0f3e415fea261421d72 chimeras --- diff --git a/blastdb.cpp b/blastdb.cpp index b139f3e..dff1074 100644 --- a/blastdb.cpp +++ b/blastdb.cpp @@ -72,6 +72,23 @@ vector BlastDB::findClosestSequences(Sequence* seq, int n) { } m8FileHandle.close(); + string root = dbFileName; + string temp = dbFileName + ".nsq"; + remove(temp.c_str()); + temp = dbFileName + ".nsi"; + remove(temp.c_str()); + + temp = dbFileName + ".nsd"; + remove(temp.c_str()); + + temp = dbFileName + ".nin"; + remove(temp.c_str()); + + temp = dbFileName + ".nhr"; + remove(temp.c_str()); + + + return topMatches; } catch(exception& e) { diff --git a/chimera.h b/chimera.h index 4d59012..4161573 100644 --- a/chimera.h +++ b/chimera.h @@ -88,6 +88,7 @@ class Chimera { Chimera(){}; Chimera(string); + Chimera(string, bool); Chimera(string, string); virtual ~Chimera(){}; virtual void setFilter(bool f) { filter = f; } diff --git a/chimerarealigner.cpp b/chimerarealigner.cpp index f45b9a8..fa9dd56 100644 --- a/chimerarealigner.cpp +++ b/chimerarealigner.cpp @@ -24,7 +24,7 @@ void ChimeraReAligner::reAlign(Sequence* query, vector parents) { string qAligned = query->getAligned(); string newQuery = ""; - + //sort parents by region start sort(parents.begin(), parents.end(), compareRegionStart); @@ -33,19 +33,16 @@ void ChimeraReAligner::reAlign(Sequence* query, vector parents) { int longest = 0; //take query and break apart into pieces using breakpoints given by results of parents for (int i = 0; i < parents.size(); i++) { - cout << parents[i].parent << '\t' << parents[i].nastRegionStart << '\t' << parents[i].nastRegionEnd << endl; - int length = parents[i].nastRegionEnd - parents[i].nastRegionStart; + int length = parents[i].nastRegionEnd - parents[i].nastRegionStart+1; string q = qAligned.substr(parents[i].nastRegionStart, length); - cout << "query = " << q << endl; Sequence* queryFrag = new Sequence(query->getName(), q); queryParts.push_back(queryFrag); - + Sequence* parent = getSequence(parents[i].parent); string p = parent->getAligned(); p = p.substr(parents[i].nastRegionStart, length); - cout << "parent = " << p << endl; parent->setAligned(p); parentParts.push_back(parent); @@ -69,7 +66,7 @@ void ChimeraReAligner::reAlign(Sequence* query, vector parents) { //make sure you don't cutoff end of query if (parents[parents.size()-1].nastRegionEnd < qAligned.length()) { newQuery += qAligned.substr(parents[parents.size()-1].nastRegionEnd); } - + //set query to new aligned string query->setAligned(newQuery); diff --git a/chimeraseqscommand.cpp b/chimeraseqscommand.cpp index e1eebf4..13adb52 100644 --- a/chimeraseqscommand.cpp +++ b/chimeraseqscommand.cpp @@ -27,7 +27,7 @@ ChimeraSeqsCommand::ChimeraSeqsCommand(string option){ else { //valid paramters for this command string Array[] = {"fasta", "filter", "correction", "processors", "method", "window", "increment", "template", "conservation", "quantile", "mask", - "numwanted", "ksize", "svg", "name", "match","mismatch", "divergence", "minsim","mincov","minbs", "minsnp","parents", "iters","outputdir","inputdir" }; + "numwanted", "ksize", "svg", "name", "match","mismatch", "divergence", "minsim","mincov","minbs", "minsnp","parents", "iters","outputdir","inputdir", "search","realign" }; vector myArray (Array, Array+(sizeof(Array)/sizeof(string))); OptionParser parser(option); @@ -177,6 +177,11 @@ ChimeraSeqsCommand::ChimeraSeqsCommand(string option){ temp = validParameter.validFile(parameters, "parents", false); if (temp == "not found") { temp = "3"; } convert(temp, parents); + temp = validParameter.validFile(parameters, "realign", false); if (temp == "not found") { temp = "f"; } + realign = isTrue(temp); + + search = validParameter.validFile(parameters, "search", false); if (search == "not found") { temp = "distance"; } + temp = validParameter.validFile(parameters, "iters", false); if ((temp == "not found") && (method == "chimeraslayer")) { temp = "100"; } else if (temp == "not found") { temp = "1000"; } @@ -213,7 +218,7 @@ void ChimeraSeqsCommand::help(){ //"fasta", "filter", "correction", "processors", "method", "window", "increment", "template", "conservation", "quantile", "mask", "numwanted", "ksize", "svg", "name" //mothurOut("chimera.seqs ASSUMES that your sequences are ALIGNED and if using a template that the template file sequences are the same length as the fasta file sequences.\n\n"); mothurOut("The chimera.seqs command reads a fastafile and creates list of potentially chimeric sequences.\n"); - mothurOut("The chimera.seqs command parameters are fasta, filter, correction, processors, mask, method, window, increment, template, conservation, quantile, numwanted, ksize, svg, name, iters.\n"); + mothurOut("The chimera.seqs command parameters are fasta, filter, correction, processors, mask, method, window, increment, template, conservation, quantile, numwanted, ksize, svg, name, iters, search, realign.\n"); mothurOut("The fasta parameter is always required and template is required if using pintail, ccode or chimeracheck.\n"); mothurOut("The filter parameter allows you to specify if you would like to apply a vertical and 50% soft filter. \n"); mothurOut("The correction parameter allows you to put more emphasis on the distance between highly similar sequences and less emphasis on the differences between remote homologs.\n"); @@ -234,6 +239,8 @@ void ChimeraSeqsCommand::help(){ mothurOut("The mincov parameter allows you to specify minimum coverage by closest matches found in template. Default is 70, meaning 70%. \n"); mothurOut("The minbs parameter allows you to specify minimum bootstrap support for calling a sequence chimeric. Default is 90, meaning 90%. \n"); mothurOut("The minsnp parameter allows you to specify percent of SNPs to sample on each side of breakpoint for computing bootstrap support (default: 10) \n"); + mothurOut("The search parameter allows you to specify search method for finding the closest parent. Choices are distance and blast, default distance. -used only by chimeraslayer. \n"); + mothurOut("The realign parameter allows you to realign the query to the potential paretns. Choices are true or false, default false. -used only by chimeraslayer. \n"); mothurOut("NOT ALL PARAMETERS ARE USED BY ALL METHODS. Please look below for method specifics.\n\n"); mothurOut("Details for each method: \n"); mothurOut("\tpintail: \n"); @@ -275,7 +282,7 @@ int ChimeraSeqsCommand::execute(){ else if (method == "pintail") { chimera = new Pintail(fastafile, outputDir); } else if (method == "ccode") { chimera = new Ccode(fastafile, outputDir); } else if (method == "chimeracheck") { chimera = new ChimeraCheckRDP(fastafile, outputDir); } - else if (method == "chimeraslayer") { chimera = new ChimeraSlayer("blast"); } + else if (method == "chimeraslayer") { chimera = new ChimeraSlayer(search, realign); } else { mothurOut("Not a valid method."); mothurOutEndLine(); return 0; } //set user options diff --git a/chimeraseqscommand.h b/chimeraseqscommand.h index bac0285..3e437ec 100644 --- a/chimeraseqscommand.h +++ b/chimeraseqscommand.h @@ -40,8 +40,8 @@ private: void appendOutputFiles(string, string); bool abort; - string method, fastafile, templatefile, consfile, quanfile, maskfile, namefile, outputDir; - bool filter, correction, svg, printAll; + string method, fastafile, templatefile, consfile, quanfile, maskfile, namefile, outputDir, search; + bool filter, correction, svg, printAll, realign; int processors, midpoint, averageLeft, averageRight, window, iters, increment, numwanted, ksize, match, mismatch, parents, minSimilarity, minCoverage, minBS, minSNP, numSeqs; float divR; Chimera* chimera; diff --git a/chimeraslayer.cpp b/chimeraslayer.cpp index 2cbd5d8..8a0cbd0 100644 --- a/chimeraslayer.cpp +++ b/chimeraslayer.cpp @@ -11,7 +11,7 @@ #include "chimerarealigner.h" //*************************************************************************************************************** -ChimeraSlayer::ChimeraSlayer(string mode) : searchMethod(mode) { decalc = new DeCalculator(); } +ChimeraSlayer::ChimeraSlayer(string mode, bool r) : searchMethod(mode), realign(r) { decalc = new DeCalculator(); } //*************************************************************************************************************** ChimeraSlayer::~ChimeraSlayer() { delete decalc; } //*************************************************************************************************************** @@ -63,10 +63,12 @@ int ChimeraSlayer::getChimeras(Sequence* query) { string chimeraFlag = maligner->getResults(query, decalc); vector Results = maligner->getOutput(); - //realign query to parents to improve slayers detection rate - ChimeraReAligner realigner(templateSeqs, match, misMatch); - realigner.reAlign(query, Results); -cout << query->getName() << '\n' << query->getAligned() << endl; + //realign query to parents to improve slayers detection rate??? + if (realign) { + ChimeraReAligner realigner(templateSeqs, match, misMatch); + realigner.reAlign(query, Results); + } + //if (chimeraFlag == "yes") { //get sequence that were given from maligner results diff --git a/chimeraslayer.h b/chimeraslayer.h index 4095b86..f8006c4 100644 --- a/chimeraslayer.h +++ b/chimeraslayer.h @@ -23,7 +23,7 @@ class ChimeraSlayer : public Chimera { public: - ChimeraSlayer(string); + ChimeraSlayer(string, bool); ~ChimeraSlayer(); int getChimeras(Sequence*); @@ -40,6 +40,7 @@ class ChimeraSlayer : public Chimera { vector chimeraResults; string chimeraFlags, searchMethod; string fastafile; + bool realign; void printBlock(data_struct, ostream&); }; diff --git a/decalc.cpp b/decalc.cpp index 911efe2..61de7c8 100644 --- a/decalc.cpp +++ b/decalc.cpp @@ -688,22 +688,48 @@ vector DeCalculator::findClosest(Sequence* querySeq, vectorgetUnaligned(); + int numBases = int(queryUnAligned.length() * 0.33); + + string leftQuery = ""; //first 1/3 of the sequence + string rightQuery = ""; //last 1/3 of the sequence string queryAligned = querySeq->getAligned(); - string leftQuery = queryAligned.substr(0, (queryAligned.length() / 3)); //first 1/3 of the sequence - string rightQuery = queryAligned.substr(((queryAligned.length() / 3)*2)); //last 1/3 of the sequence + + //left side + int baseCount = 0; + int leftSpot = 0; + for (int i = 0; i < queryAligned.length(); i++) { + leftQuery += queryAligned[i]; + //if you are a base + if ((queryAligned[i] != '.') && (queryAligned[i] != '-')) { baseCount++; } + //if you have 1/3 + if (baseCount >= numBases) { leftSpot = i; break; } //first 1/3 + } + + //right side - count through another 1/3, so you are at last third + baseCount = 0; + int rightSpot = 0; + for (int i = leftSpot; i < queryAligned.length(); i++) { + //if you are a base + if ((queryAligned[i] != '.') && (queryAligned[i] != '-')) { baseCount++; } + //if you have 1/3 + if (baseCount >= numBases) { rightSpot = i; break; } //last 1/3 + } + rightQuery = queryAligned.substr(rightSpot); //sequence from pos spot to end Sequence queryLeft(querySeq->getName(), leftQuery); Sequence queryRight(querySeq->getName(), rightQuery); - +//cout << querySeq->getName() << '\t' << leftSpot << '\t' << rightSpot << endl; +//cout << queryUnAligned.length() << '\t' << queryLeft.getUnaligned().length() << '\t' << queryRight.getUnaligned().length() << endl; for(int j = 0; j < db.size(); j++){ string dbAligned = db[j]->getAligned(); - string leftDB = dbAligned.substr(0, (dbAligned.length() / 3)); //first 1/3 of the sequence - string rightDB = dbAligned.substr(((dbAligned.length() / 3)*2)); //last 1/3 of the sequence + string leftDB = dbAligned.substr(0, leftSpot+1); //first 1/3 of the sequence + string rightDB = dbAligned.substr(rightSpot); //last 1/3 of the sequence Sequence dbLeft(db[j]->getName(), leftDB); Sequence dbRight(db[j]->getName(), rightDB); - + distcalculator->calcDist(queryLeft, dbLeft); float distLeft = distcalculator->getDist();