From 81276c241b984898f8d30ad123c00592ee6db7b8 Mon Sep 17 00:00:00 2001 From: westcott Date: Mon, 22 Feb 2010 17:22:22 +0000 Subject: [PATCH] chimeras, fix to sabundvector and sharedsabundvector that caused getRabundVector to step out of bounds --- averagelinkage.cpp | 7 - blastdb.cpp | 5 +- ccode.cpp | 22 ++-- chimera.cpp | 18 ++- chimera.h | 8 +- chimeraseqscommand.cpp | 11 +- chimeraslayer.cpp | 292 +++++++++++++++++++++++++++-------------- chimeraslayer.h | 9 +- cluster.cpp | 4 + decalc.cpp | 142 +++++++++++++++++--- decalc.h | 3 +- maligner.cpp | 124 ++++++++++++++--- maligner.h | 7 +- mothur.cpp | 9 ++ pintail.cpp | 93 ++++++++----- pintail.h | 1 + removeseqscommand.cpp | 64 ++++++--- removeseqscommand.h | 2 +- sabundvector.cpp | 6 +- sharedsabundvector.cpp | 4 +- 20 files changed, 596 insertions(+), 235 deletions(-) diff --git a/averagelinkage.cpp b/averagelinkage.cpp index db2c51e..7522d2a 100644 --- a/averagelinkage.cpp +++ b/averagelinkage.cpp @@ -38,15 +38,8 @@ bool AverageLinkage::updateDistance(MatData& colCell, MatData& rowCell) { saveCol = smallCol; } - float oldColCell = colCell->dist; - colCell->dist = (colBin * colCell->dist + rowBin * rowCell->dist) / totalBin; - //warn user if merge with value above cutoff produces a value below cutoff - if ((colCell->dist < cutoff) && ((oldColCell > cutoff) || (rowCell->dist > cutoff)) ) { - mothurOut("Warning: merging " + toString(oldColCell) + " with " + toString(rowCell->dist) + ", new value = " + toString(colCell->dist) + ". Results will differ from those if cutoff was used in the read.dist command."); mothurOutEndLine(); - } - return(true); } catch(exception& e) { diff --git a/blastdb.cpp b/blastdb.cpp index dff1074..396a6b4 100644 --- a/blastdb.cpp +++ b/blastdb.cpp @@ -118,7 +118,7 @@ vector BlastDB::findClosestMegaBlast(Sequence* seq, int n) { system(blastCommand.c_str()); ifstream m8FileHandle; - openInputFile(blastFileName, m8FileHandle); + openInputFile(blastFileName, m8FileHandle, "no error"); string dummy; int templateAccession; @@ -132,9 +132,10 @@ vector BlastDB::findClosestMegaBlast(Sequence* seq, int n) { gobble(m8FileHandle); topMatches.push_back(templateAccession); +//cout << templateAccession << endl; } m8FileHandle.close(); - +//cout << "\n\n" ; return topMatches; } catch(exception& e) { diff --git a/ccode.cpp b/ccode.cpp index bfcd44d..326af7c 100644 --- a/ccode.cpp +++ b/ccode.cpp @@ -48,13 +48,10 @@ void Ccode::print(ostream& out) { out2 << it->first << '\t' << it->second << endl; } out2.close(); - - out.setf(ios::fixed, ios::floatfield); out.setf(ios::showpoint); - out << querySeq->getName() << endl << endl << "Reference sequences used and distance to query:" << endl; for (int j = 0; j < closest.size(); j++) { - out << setprecision(3) << closest[j].seq->getName() << '\t' << closest[j].dist << endl; + out << closest[j].seq->getName() << '\t' << closest[j].dist << endl; } out << endl << endl; @@ -72,14 +69,12 @@ void Ccode::print(ostream& out) { out << endl << "Window\tStartPos\tEndPos" << endl; it = trim.begin(); - for (int k = 0; k < windows.size()-1; k++) { out << k+1 << '\t' << spotMap[windows[k]-it->first] << '\t' << spotMap[windows[k]-it->first+windowSizes] << endl; } out << windows.size() << '\t' << spotMap[windows[windows.size()-1]-it->first] << '\t' << spotMap[it->second-it->first-1] << endl; out << endl; - out << "Window\tAvgQ\t(sdQ)\tAvgR\t(sdR)\tRatio\tAnova" << endl; for (int k = 0; k < windows.size(); k++) { float ds = averageQuery[k] / averageRef[k]; @@ -94,7 +89,7 @@ void Ccode::print(ostream& out) { //float fs = varQuery[query] / varRef[query]; /* F-Snedecor, test for differences of variances */ bool results = false; - + //confidence limit, t - Student, anova out << "Window\tConfidenceLimit\tt-Student\tAnova" << endl; @@ -118,6 +113,10 @@ void Ccode::print(ostream& out) { if (results) { mothurOut(querySeq->getName() + " was found have at least one chimeric window."); mothurOutEndLine(); } + + //free memory + for (int i = 0; i < closest.size(); i++) { delete closest[i].seq; } + } catch(exception& e) { @@ -176,7 +175,7 @@ int Ccode::getChimeras(Sequence* query) { for (int i = 0; i < closest.size(); i++) { temp.push_back(closest[i].seq); } temp.push_back(query); - createFilter(temp); + createFilter(temp, 0.5); for (int i = 0; i < temp.size(); i++) { runFilter(temp[i]); } @@ -221,9 +220,6 @@ int Ccode::getChimeras(Sequence* query) { determineChimeras(); //fills anova, isChimericConfidence, isChimericTStudent and isChimericANOVA. - //free memory - for (int i = 0; i < closest.size(); i++) { delete closest[i].seq; } - return 0; } catch(exception& e) { @@ -525,6 +521,10 @@ vector Ccode::findClosest(Sequence* q, int numWanted) { } sort(topMatches.begin(), topMatches.end(), compareSeqDist); + + for (int i = numWanted; i < topMatches.size(); i++) { delete topMatches[i].seq; } + + topMatches.resize(numWanted); return topMatches; diff --git a/chimera.cpp b/chimera.cpp index bf16de4..9ad287b 100644 --- a/chimera.cpp +++ b/chimera.cpp @@ -11,10 +11,10 @@ //*************************************************************************************************************** //this is a vertical soft filter -void Chimera::createFilter(vector seqs) { +string Chimera::createFilter(vector seqs, float t) { try { filterString = ""; - int threshold = int (0.5 * seqs.size()); + int threshold = int (t * seqs.size()); //cout << "threshhold = " << threshold << endl; vector gaps; gaps.resize(seqs[0]->getAligned().length(), 0); @@ -53,6 +53,7 @@ void Chimera::createFilter(vector seqs) { //cout << "filter = " << filterString << endl; mothurOut("Filter removed " + toString(numColRemoved) + " columns."); mothurOutEndLine(); + return filterString; } catch(exception& e) { errorOut(e, "Chimera", "createFilter"); @@ -60,18 +61,25 @@ void Chimera::createFilter(vector seqs) { } } //*************************************************************************************************************** -void Chimera::runFilter(Sequence* seq) { +map Chimera::runFilter(Sequence* seq) { try { - + map maskMap; string seqAligned = seq->getAligned(); string newAligned = ""; + int count = 0; for (int j = 0; j < seqAligned.length(); j++) { //if this spot is a gap - if (filterString[j] == '1') { newAligned += seqAligned[j]; } + if (filterString[j] == '1') { + newAligned += seqAligned[j]; + maskMap[count] = j; + count++; + } } seq->setAligned(newAligned); + + return maskMap; } catch(exception& e) { errorOut(e, "Chimera", "runFilter"); diff --git a/chimera.h b/chimera.h index 4161573..da908ce 100644 --- a/chimera.h +++ b/chimera.h @@ -45,6 +45,7 @@ struct results { int nastRegionStart; int nastRegionEnd; string parent; + string parentAligned; float queryToParent; float queryToParentLocal; float divR; @@ -53,6 +54,7 @@ struct results { struct SeqDist { Sequence* seq; float dist; + int index; }; //******************************************************************************************************************** //sorts lowest to highest @@ -88,7 +90,7 @@ class Chimera { Chimera(){}; Chimera(string); - Chimera(string, bool); + Chimera(string, bool, string); Chimera(string, string); virtual ~Chimera(){}; virtual void setFilter(bool f) { filter = f; } @@ -119,8 +121,8 @@ class Chimera { virtual vector readSeqs(string); virtual vector< vector > readQuantiles(); virtual void setMask(string); - virtual void runFilter(Sequence*); - virtual void createFilter(vector); + virtual map runFilter(Sequence*); + virtual string createFilter(vector, float); virtual void printHeader(ostream&){}; virtual int getChimeras(Sequence*){ return 0; } diff --git a/chimeraseqscommand.cpp b/chimeraseqscommand.cpp index 13adb52..2ec6ea9 100644 --- a/chimeraseqscommand.cpp +++ b/chimeraseqscommand.cpp @@ -180,7 +180,7 @@ ChimeraSeqsCommand::ChimeraSeqsCommand(string option){ temp = validParameter.validFile(parameters, "realign", false); if (temp == "not found") { temp = "f"; } realign = isTrue(temp); - search = validParameter.validFile(parameters, "search", false); if (search == "not found") { temp = "distance"; } + search = validParameter.validFile(parameters, "search", false); if (search == "not found") { search = "distance"; } temp = validParameter.validFile(parameters, "iters", false); if ((temp == "not found") && (method == "chimeraslayer")) { temp = "100"; } @@ -198,7 +198,7 @@ ChimeraSeqsCommand::ChimeraSeqsCommand(string option){ else if (temp == "not found") { temp = "20"; } convert(temp, numwanted); - + if ((search != "distance") && (search != "blast") && (search != "kmer")) { mothurOut(search + " is not a valid search."); mothurOutEndLine(); abort = true; } if (((method != "bellerophon")) && (templatefile == "")) { mothurOut("You must provide a template file with the pintail, ccode, chimeraslayer or chimeracheck methods."); mothurOutEndLine(); abort = true; } @@ -239,7 +239,7 @@ void ChimeraSeqsCommand::help(){ mothurOut("The mincov parameter allows you to specify minimum coverage by closest matches found in template. Default is 70, meaning 70%. \n"); mothurOut("The minbs parameter allows you to specify minimum bootstrap support for calling a sequence chimeric. Default is 90, meaning 90%. \n"); mothurOut("The minsnp parameter allows you to specify percent of SNPs to sample on each side of breakpoint for computing bootstrap support (default: 10) \n"); - mothurOut("The search parameter allows you to specify search method for finding the closest parent. Choices are distance and blast, default distance. -used only by chimeraslayer. \n"); + mothurOut("The search parameter allows you to specify search method for finding the closest parent. Choices are distance, blast, and kmer, default distance. -used only by chimeraslayer. \n"); mothurOut("The realign parameter allows you to realign the query to the potential paretns. Choices are true or false, default false. -used only by chimeraslayer. \n"); mothurOut("NOT ALL PARAMETERS ARE USED BY ALL METHODS. Please look below for method specifics.\n\n"); mothurOut("Details for each method: \n"); @@ -282,7 +282,7 @@ int ChimeraSeqsCommand::execute(){ else if (method == "pintail") { chimera = new Pintail(fastafile, outputDir); } else if (method == "ccode") { chimera = new Ccode(fastafile, outputDir); } else if (method == "chimeracheck") { chimera = new ChimeraCheckRDP(fastafile, outputDir); } - else if (method == "chimeraslayer") { chimera = new ChimeraSlayer(search, realign); } + else if (method == "chimeraslayer") { chimera = new ChimeraSlayer(search, realign, fastafile); } else { mothurOut("Not a valid method."); mothurOutEndLine(); return 0; } //set user options @@ -418,7 +418,8 @@ int ChimeraSeqsCommand::execute(){ // else { fastafile = outputDir + getRootName(getSimpleName(fastafile)) + "filter.fasta"; } appendOutputFiles(tempHeader, outputFileName); - remove(tempHeader.c_str()); + remove(outputFileName.c_str()); + rename(tempHeader.c_str(), outputFileName.c_str()); for (int i = 0; i < templateSeqs.size(); i++) { delete templateSeqs[i]; } diff --git a/chimeraslayer.cpp b/chimeraslayer.cpp index 8a0cbd0..7690328 100644 --- a/chimeraslayer.cpp +++ b/chimeraslayer.cpp @@ -9,16 +9,113 @@ #include "chimeraslayer.h" #include "chimerarealigner.h" +#include "kmerdb.hpp" //*************************************************************************************************************** -ChimeraSlayer::ChimeraSlayer(string mode, bool r) : searchMethod(mode), realign(r) { decalc = new DeCalculator(); } +ChimeraSlayer::ChimeraSlayer(string mode, bool r, string f) : searchMethod(mode), realign(r), fastafile(f) { + decalc = new DeCalculator(); +} //*************************************************************************************************************** -ChimeraSlayer::~ChimeraSlayer() { delete decalc; } +void ChimeraSlayer::doPrep() { + try { + + string kmerDBNameLeft; + string kmerDBNameRight; + + //generate the kmerdb to pass to maligner + if (searchMethod == "kmer") { + //leftside + string leftTemplateFileName = "left." + templateFileName; + databaseLeft = new KmerDB(leftTemplateFileName, kmerSize); + kmerDBNameLeft = leftTemplateFileName.substr(0,leftTemplateFileName.find_last_of(".")+1) + char('0'+ kmerSize) + "mer"; + ifstream kmerFileTestLeft(kmerDBNameLeft.c_str()); + + if(!kmerFileTestLeft){ + + for (int i = 0; i < templateSeqs.size(); i++) { + string leftFrag = templateSeqs[i]->getUnaligned(); + leftFrag = leftFrag.substr(0, int(leftFrag.length() * 0.33)); + + Sequence leftTemp(templateSeqs[i]->getName(), leftFrag); + databaseLeft->addSequence(leftTemp); + } + databaseLeft->generateDB(); + + }else { + databaseLeft->readKmerDB(kmerFileTestLeft); + } + kmerFileTestLeft.close(); + + databaseLeft->setNumSeqs(templateSeqs.size()); + + //rightside + string rightTemplateFileName = "right." + templateFileName; + databaseRight = new KmerDB(rightTemplateFileName, kmerSize); + kmerDBNameRight = rightTemplateFileName.substr(0,rightTemplateFileName.find_last_of(".")+1) + char('0'+ kmerSize) + "mer"; + ifstream kmerFileTestRight(kmerDBNameRight.c_str()); + + if(!kmerFileTestRight){ + + for (int i = 0; i < templateSeqs.size(); i++) { + string rightFrag = templateSeqs[i]->getUnaligned(); + rightFrag = rightFrag.substr(int(rightFrag.length() * 0.66)); + + Sequence rightTemp(templateSeqs[i]->getName(), rightFrag); + databaseRight->addSequence(rightTemp); + } + databaseRight->generateDB(); + + }else { + databaseRight->readKmerDB(kmerFileTestRight); + } + kmerFileTestRight.close(); + + databaseRight->setNumSeqs(templateSeqs.size()); + + } + + int start = time(NULL); + //filter the sequences + //read in all query seqs + ifstream in; + openInputFile(fastafile, in); + + vector tempQuerySeqs; + while(!in.eof()){ + Sequence* s = new Sequence(in); + gobble(in); + + if (s->getName() != "") { tempQuerySeqs.push_back(s); } + } + in.close(); + + vector temp = templateSeqs; + for (int i = 0; i < tempQuerySeqs.size(); i++) { temp.push_back(tempQuerySeqs[i]); } + + createFilter(temp, 0.0); //just removed columns where all seqs have a gap + + for (int i = 0; i < tempQuerySeqs.size(); i++) { delete tempQuerySeqs[i]; } + + //run filter on template + for (int i = 0; i < templateSeqs.size(); i++) { runFilter(templateSeqs[i]); } + + mothurOutEndLine(); mothurOut("It took " + toString(time(NULL) - start) + " secs to filter."); mothurOutEndLine(); + + } + catch(exception& e) { + errorOut(e, "ChimeraSlayer", "doprep"); + exit(1); + } +} +//*************************************************************************************************************** +ChimeraSlayer::~ChimeraSlayer() { delete decalc; if (searchMethod == "kmer") { delete databaseRight; delete databaseLeft; } } //*************************************************************************************************************** void ChimeraSlayer::printHeader(ostream& out) { mothurOutEndLine(); - mothurOut("Only reporting sequence supported by 90% of bootstrapped results."); + mothurOut("Only reporting sequence supported by " + toString(minBS) + "% of bootstrapped results."); mothurOutEndLine(); + + out << "Name\tLeftParent\tRightParent\tDivQLAQRB\tPerIDQLAQRB\tBootStrapA\tDivQLBQRA\tPerIDQLBQRA\tBootStrapB\tFlag\tLeftWindow\tRightWindow\n"; } //*************************************************************************************************************** void ChimeraSlayer::print(ostream& out) { @@ -35,7 +132,7 @@ void ChimeraSlayer::print(ostream& out) { mothurOut(querySeq->getName() + "\tyes"); mothurOutEndLine(); } } - out << querySeq->getName() << "\tyes" << endl; + printBlock(chimeraResults[0], out); out << endl; }else { out << querySeq->getName() << "\tno" << endl; } @@ -50,114 +147,102 @@ void ChimeraSlayer::print(ostream& out) { int ChimeraSlayer::getChimeras(Sequence* query) { try { chimeraFlags = "no"; - querySeq = query; - for (int i = 0; i < query->getAligned().length(); i++) { - spotMap[i] = i; - } + //filter query + spotMap = runFilter(query); + + querySeq = query; //referenceSeqs, numWanted, matchScore, misMatchPenalty, divR, minSimilarity - maligner = new Maligner(templateSeqs, numWanted, match, misMatch, divR, minSim, minCov, searchMethod); + maligner = new Maligner(templateSeqs, numWanted, match, misMatch, divR, minSim, minCov, searchMethod, databaseLeft, databaseRight); slayer = new Slayer(window, increment, minSim, divR, iters, minSNP); string chimeraFlag = maligner->getResults(query, decalc); vector Results = maligner->getOutput(); - - //realign query to parents to improve slayers detection rate??? + + //found in testing realigning only made things worse if (realign) { ChimeraReAligner realigner(templateSeqs, match, misMatch); realigner.reAlign(query, Results); } - //if (chimeraFlag == "yes") { - - //get sequence that were given from maligner results - vector seqs; - map removeDups; - map::iterator itDup; - for (int j = 0; j < Results.size(); j++) { - float dist = (Results[j].regionEnd - Results[j].regionStart + 1) * Results[j].queryToParentLocal; - //only add if you are not a duplicate - itDup = removeDups.find(Results[j].parent); - if (itDup == removeDups.end()) { //this is not duplicate - removeDups[Results[j].parent] = dist; - }else if (dist > itDup->second) { //is this a stronger number for this parent - removeDups[Results[j].parent] = dist; - } - } - - for (itDup = removeDups.begin(); itDup != removeDups.end(); itDup++) { - Sequence* seq = getSequence(itDup->first); //makes copy so you can filter and mask and not effect template - - SeqDist member; - member.seq = seq; - member.dist = itDup->second; + if (chimeraFlag == "yes") { - seqs.push_back(member); - } - - //limit number of parents to explore - default 3 - if (Results.size() > parents) { - //sort by distance - sort(seqs.begin(), seqs.end(), compareSeqDist); - //prioritize larger more similiar sequence fragments - reverse(seqs.begin(), seqs.end()); + //get sequence that were given from maligner results + vector seqs; + map removeDups; + map::iterator itDup; + map parentNameSeq; + map::iterator itSeq; + for (int j = 0; j < Results.size(); j++) { + float dist = (Results[j].regionEnd - Results[j].regionStart + 1) * Results[j].queryToParentLocal; + //only add if you are not a duplicate + itDup = removeDups.find(Results[j].parent); + if (itDup == removeDups.end()) { //this is not duplicate + removeDups[Results[j].parent] = dist; + parentNameSeq[Results[j].parent] = Results[j].parentAligned; + }else if (dist > itDup->second) { //is this a stronger number for this parent + removeDups[Results[j].parent] = dist; + parentNameSeq[Results[j].parent] = Results[j].parentAligned; + } + } - for (int k = seqs.size()-1; k > (parents-1); k--) { - delete seqs[k].seq; - seqs.pop_back(); + for (itDup = removeDups.begin(); itDup != removeDups.end(); itDup++) { + //Sequence* seq = getSequence(itDup->first); //makes copy so you can filter and mask and not effect template + itSeq = parentNameSeq.find(itDup->first); +//cout << itDup->first << itSeq->second << endl; + Sequence* seq = new Sequence(itDup->first, itSeq->second); + + SeqDist member; + member.seq = seq; + member.dist = itDup->second; + + seqs.push_back(member); } - } - - //put seqs into vector to send to slayer - vector seqsForSlayer; - for (int k = 0; k < seqs.size(); k++) { seqsForSlayer.push_back(seqs[k].seq); } - //cout << i+1 << "num parents = " << seqsForSlayer.size() << '\t' << chimeraFlag << endl; -//ofstream out; -//string name = querySeqs[i]->getName(); -//cout << name << endl; -//name = name.substr(name.find_first_of("|")+1); -//cout << name << endl; -//name = name.substr(name.find_first_of("|")+1); -//cout << name << endl; -//name = name.substr(0, name.find_last_of("|")); -//cout << name << endl; -//string filename = toString(i+1) + ".seqsforslayer"; -//openOutputFile(filename, out); -//cout << querySeqs[i]->getName() << endl; -//for (int u = 0; u < seqsForSlayer.size(); u++) { cout << seqsForSlayer[u]->getName() << '\t'; seqsForSlayer[u]->printSequence(out); } -//cout << endl; -//out.close(); -//filename = toString(i+1) + ".fasta"; -//openOutputFile(filename, out); -//querySeqs[i]->printSequence(out); -//out.close(); - - - //mask then send to slayer... - if (seqMask != "") { - decalc->setMask(seqMask); + //limit number of parents to explore - default 3 + if (Results.size() > parents) { + //sort by distance + sort(seqs.begin(), seqs.end(), compareSeqDist); + //prioritize larger more similiar sequence fragments + reverse(seqs.begin(), seqs.end()); + + for (int k = seqs.size()-1; k > (parents-1); k--) { + delete seqs[k].seq; + seqs.pop_back(); + } + } - //mask querys - decalc->runMask(query); + //put seqs into vector to send to slayer + vector seqsForSlayer; + for (int k = 0; k < seqs.size(); k++) { seqsForSlayer.push_back(seqs[k].seq); } - //mask parents - for (int k = 0; k < seqsForSlayer.size(); k++) { - decalc->runMask(seqsForSlayer[k]); + //mask then send to slayer... + if (seqMask != "") { + decalc->setMask(seqMask); + + //mask querys + decalc->runMask(query); + + //mask parents + for (int k = 0; k < seqsForSlayer.size(); k++) { + decalc->runMask(seqsForSlayer[k]); + } + + spotMap = decalc->getMaskMap(); } - spotMap = decalc->getMaskMap(); + //send to slayer + chimeraFlags = slayer->getResults(query, seqsForSlayer); + chimeraResults = slayer->getOutput(); + + //free memory + for (int k = 0; k < seqs.size(); k++) { delete seqs[k].seq; } } - //send to slayer - chimeraFlags = slayer->getResults(query, seqsForSlayer); - chimeraResults = slayer->getOutput(); - - //free memory - for (int k = 0; k < seqs.size(); k++) { delete seqs[k].seq; } - //} - + delete maligner; + delete slayer; + return 0; } catch(exception& e) { @@ -168,28 +253,31 @@ int ChimeraSlayer::getChimeras(Sequence* query) { //*************************************************************************************************************** void ChimeraSlayer::printBlock(data_struct data, ostream& out){ try { + //out << "Name\tParentA\tParentB\tDivQLAQRB\tPerIDQLAQRB\tBootStrapA\tDivQLBQRA\tPerIDQLBQRA\tBootStrapB\tFlag\tLeftWindow\tRightWindow\n"; + + out << querySeq->getName() << '\t'; + out << data.parentA.getName() << "\t" << data.parentB.getName() << '\t'; + //out << "Left Window: " << spotMap[data.winLStart] << " " << spotMap[data.winLEnd] << endl; + //out << "Right Window: " << spotMap[data.winRStart] << " " << spotMap[data.winREnd] << endl; + + out << data.divr_qla_qrb << '\t' << data.qla_qrb << '\t' << data.bsa << '\t'; + out << data.divr_qlb_qra << '\t' << data.qlb_qra << '\t' << data.bsb << '\t'; - out << "parentA: " << data.parentA.getName() << " parentB: " << data.parentB.getName() << endl; - out << "Left Window: " << spotMap[data.winLStart] << " " << spotMap[data.winLEnd] << endl; - out << "Right Window: " << spotMap[data.winRStart] << " " << spotMap[data.winREnd] << endl; + out << "yes\t" << spotMap[data.winLStart] << "-" << spotMap[data.winLEnd] << '\t' << spotMap[data.winRStart] << "-" << spotMap[data.winREnd] << '\t'; - out << "Divergence of Query to Leftside ParentA and Rightside ParentB: " << data.divr_qla_qrb << '\t' << "Bootstrap: " << data.bsa << endl; - out << "Divergence of Query to Rightside ParentA and Leftside ParentB: " << data.divr_qlb_qra << '\t' << "Bootstrap: " << data.bsb << endl; + //out << "Similarity of parents: " << data.ab << endl; + //out << "Similarity of query to parentA: " << data.qa << endl; + //out << "Similarity of query to parentB: " << data.qb << endl; - out << "Similarity of parents: " << data.ab << endl; - out << "Similarity of query to parentA: " << data.qa << endl; - out << "Similarity of query to parentB: " << data.qb << endl; - out << "Percent_ID QLA_QRB: " << data.qla_qrb << endl; - out << "Percent_ID QLB_QRA: " << data.qlb_qra << endl; //out << "Per_id(QL,A): " << data.qla << endl; //out << "Per_id(QL,B): " << data.qlb << endl; //out << "Per_id(QR,A): " << data.qra << endl; //out << "Per_id(QR,B): " << data.qrb << endl; - out << "DeltaL: " << (data.qla - data.qlb) << endl; - out << "DeltaR: " << (data.qra - data.qrb) << endl; + //out << "DeltaL: " << (data.qla - data.qlb) << endl; + //out << "DeltaR: " << (data.qra - data.qrb) << endl; } catch(exception& e) { diff --git a/chimeraslayer.h b/chimeraslayer.h index f8006c4..bf9cf7f 100644 --- a/chimeraslayer.h +++ b/chimeraslayer.h @@ -23,12 +23,13 @@ class ChimeraSlayer : public Chimera { public: - ChimeraSlayer(string, bool); + ChimeraSlayer(string, bool, string); ~ChimeraSlayer(); int getChimeras(Sequence*); void print(ostream&); void printHeader(ostream&); + void doPrep(); private: Sequence* querySeq; @@ -36,13 +37,15 @@ class ChimeraSlayer : public Chimera { Maligner* maligner; Slayer* slayer; map spotMap; + Database* databaseRight; + Database* databaseLeft; vector chimeraResults; - string chimeraFlags, searchMethod; - string fastafile; + string chimeraFlags, searchMethod, fastafile; bool realign; void printBlock(data_struct, ostream&); + }; /************************************************************************/ diff --git a/cluster.cpp b/cluster.cpp index 00d8083..931b44d 100644 --- a/cluster.cpp +++ b/cluster.cpp @@ -216,6 +216,10 @@ void Cluster::update(){ for (int i=nColCells-1;i>=0;i--) { if (found[i] == 0) { removeCell(colCells[i], -1, i); +cout << "smallRow = " << smallRow+1 << " smallCol = " << smallCol+1 << endl; + if (!((colCells[i]->row == smallRow) && (colCells[i]->column == smallCol))) { + mothurOut("Warning: merging " + toString(colCells[i]->row+1) + " " + toString(colCells[i]->column+1) + " distance " + toString(colCells[i]->dist) + " value above cutoff. Results will differ from those if cutoff was used in the read.dist command."); mothurOutEndLine(); + } } } } diff --git a/decalc.cpp b/decalc.cpp index 61de7c8..c5f0c26 100644 --- a/decalc.cpp +++ b/decalc.cpp @@ -11,6 +11,7 @@ #include "chimera.h" #include "dist.h" #include "eachgapdist.h" +#include "ignoregaps.h" //*************************************************************************************************************** @@ -680,11 +681,13 @@ float DeCalculator::getCoef(vector obs, vector qav) { } //*************************************************************************************************************** //gets closest matches to each end, since chimeras will most likely have different parents on each end -vector DeCalculator::findClosest(Sequence* querySeq, vector db, int numWanted) { +vector DeCalculator::findClosest(Sequence* querySeq, vector db, int& numWanted, vector& indexes) { try { + indexes.clear(); vector seqsMatches; - vector dists; + vector distsLeft; + vector distsRight; Dist* distcalculator = new eachGapDist(); @@ -696,12 +699,19 @@ vector DeCalculator::findClosest(Sequence* querySeq, vectorgetAligned(); //left side + bool foundFirstBase = false; int baseCount = 0; int leftSpot = 0; + int firstBaseSpot = 0; for (int i = 0; i < queryAligned.length(); i++) { - leftQuery += queryAligned[i]; //if you are a base - if ((queryAligned[i] != '.') && (queryAligned[i] != '-')) { baseCount++; } + if (isalpha(queryAligned[i])) { + baseCount++; + if (!foundFirstBase) { foundFirstBase = true; firstBaseSpot = i; } + } + + //eliminate opening .'s + if (foundFirstBase) { leftQuery += queryAligned[i]; } //if you have 1/3 if (baseCount >= numBases) { leftSpot = i; break; } //first 1/3 } @@ -711,21 +721,31 @@ vector DeCalculator::findClosest(Sequence* querySeq, vector= numBases) { rightSpot = i; break; } //last 1/3 } - rightQuery = queryAligned.substr(rightSpot); //sequence from pos spot to end + + //trim end + //find last position in query that is a non gap character + int lastBaseSpot = queryAligned.length()-1; + for (int j = queryAligned.length()-1; j >= 0; j--) { + if (isalpha(queryAligned[j])) { + lastBaseSpot = j; + break; + } + } + rightQuery = queryAligned.substr(rightSpot, (lastBaseSpot-rightSpot)); //sequence from pos spot to end Sequence queryLeft(querySeq->getName(), leftQuery); Sequence queryRight(querySeq->getName(), rightQuery); -//cout << querySeq->getName() << '\t' << leftSpot << '\t' << rightSpot << endl; +//cout << querySeq->getName() << '\t' << leftSpot << '\t' << rightSpot << '\t' << firstBaseSpot << '\t' << lastBaseSpot << endl; //cout << queryUnAligned.length() << '\t' << queryLeft.getUnaligned().length() << '\t' << queryRight.getUnaligned().length() << endl; for(int j = 0; j < db.size(); j++){ string dbAligned = db[j]->getAligned(); - string leftDB = dbAligned.substr(0, leftSpot+1); //first 1/3 of the sequence - string rightDB = dbAligned.substr(rightSpot); //last 1/3 of the sequence + string leftDB = dbAligned.substr(firstBaseSpot, (leftSpot-firstBaseSpot+1)); //first 1/3 of the sequence + string rightDB = dbAligned.substr(rightSpot, (lastBaseSpot-rightSpot)); //last 1/3 of the sequence Sequence dbLeft(db[j]->getName(), leftDB); Sequence dbRight(db[j]->getName(), rightDB); @@ -736,22 +756,78 @@ vector DeCalculator::findClosest(Sequence* querySeq, vectorcalcDist(queryRight, dbRight); float distRight = distcalculator->getDist(); - float dist = min(distLeft, distRight); //keep smallest distance + SeqDist subjectLeft; + subjectLeft.seq = db[j]; + subjectLeft.dist = distLeft; + subjectLeft.index = j; + + distsLeft.push_back(subjectLeft); - SeqDist subject; - subject.seq = db[j]; - subject.dist = dist; + SeqDist subjectRight; + subjectRight.seq = db[j]; + subjectRight.dist = distRight; + subjectRight.index = j; - dists.push_back(subject); + distsRight.push_back(subjectRight); + } delete distcalculator; - sort(dists.begin(), dists.end(), compareSeqDist); + //sort by smallest distance + sort(distsRight.begin(), distsRight.end(), compareSeqDist); + sort(distsLeft.begin(), distsLeft.end(), compareSeqDist); + + //merge results + map seen; + map::iterator it; + vector dists; + float lastRight = distsRight[0].dist; + float lastLeft = distsLeft[0].dist; + int lasti = 0; + for (int i = 0; i < distsLeft.size(); i++) { + //add left if you havent already + it = seen.find(distsLeft[i].seq->getName()); + if (it == seen.end()) { + dists.push_back(distsLeft[i]); + seen[distsLeft[i].seq->getName()] = distsLeft[i].seq->getName(); + lastLeft = distsLeft[i].dist; + } + + //add right if you havent already + it = seen.find(distsRight[i].seq->getName()); + if (it == seen.end()) { + dists.push_back(distsRight[i]); + seen[distsRight[i].seq->getName()] = distsRight[i].seq->getName(); + lastRight = distsRight[i].dist; + } + + if (dists.size() > numWanted) { lasti = i; break; } //you have enough results + } + + //add in dups + lasti++; + int i = lasti; + while (i < distsLeft.size()) { + if (distsLeft[i].dist == lastLeft) { dists.push_back(distsLeft[i]); numWanted++; } + else { break; } + i++; + } + + i = lasti; + while (i < distsRight.size()) { + if (distsRight[i].dist == lastRight) { dists.push_back(distsRight[i]); numWanted++; } + else { break; } + i++; + } + +//cout << numWanted << endl; for (int i = 0; i < numWanted; i++) { +//cout << dists[i].seq->getName() << '\t' << dists[i].dist << endl; Sequence* temp = new Sequence(dists[i].seq->getName(), dists[i].seq->getAligned()); //have to make a copy so you can trim and filter without stepping on eachother. seqsMatches.push_back(temp); + indexes.push_back(dists[i].index); } return seqsMatches; @@ -761,6 +837,38 @@ vector DeCalculator::findClosest(Sequence* querySeq, vector db) { + try { + + Sequence* seqsMatch; + + Dist* distcalculator = new eachGapDist(); + int index = 0; + int smallest = 1000000; + + for(int j = 0; j < db.size(); j++){ + + distcalculator->calcDist(*querySeq, *db[j]); + float dist = distcalculator->getDist(); + + if (dist < smallest) { + smallest = dist; + index = j; + } + } + + delete distcalculator; + + seqsMatch = new Sequence(db[index]->getName(), db[index]->getAligned()); //have to make a copy so you can trim and filter without stepping on eachother. + + return seqsMatch; + } + catch(exception& e) { + errorOut(e, "DeCalculator", "findClosest"); + exit(1); + } +} /***************************************************************************************************************/ map DeCalculator::trimSeqs(Sequence* query, vector topMatches) { try { @@ -838,10 +946,10 @@ map DeCalculator::trimSeqs(Sequence* query, vector topMatch //check to make sure that is not whole seq if ((rearPos - frontPos - 1) <= 0) { mothurOut("Error, when I trim your sequences, the entire sequence is trimmed."); mothurOutEndLine(); exit(1); } -//cout << "front = " << frontPos << " rear = " << rearPos << endl; +//cout << query->getName() << " front = " << frontPos << " rear = " << rearPos << endl; //trim query string newAligned = query->getAligned(); - newAligned = newAligned.substr(frontPos, (rearPos-frontPos)); + newAligned = newAligned.substr(frontPos, (rearPos-frontPos+1)); query->setAligned(newAligned); //trim topMatches diff --git a/decalc.h b/decalc.h index 0119936..48a72bb 100644 --- a/decalc.h +++ b/decalc.h @@ -39,7 +39,8 @@ class DeCalculator { DeCalculator() {}; ~DeCalculator() {}; - vector findClosest(Sequence*, vector, int); //takes querySeq, a reference db and numWanted - returns indexes to closest seqs in db + vector findClosest(Sequence*, vector, int&, vector&); //takes querySeq, a reference db, numWanted and indexes + Sequence* findClosest(Sequence*, vector); set getPos() { return h; } void setMask(string); void setAlignmentLength(int l) { alignLength = l; } diff --git a/maligner.cpp b/maligner.cpp index b393194..91b0fa7 100644 --- a/maligner.cpp +++ b/maligner.cpp @@ -8,12 +8,12 @@ */ #include "maligner.h" -#include "database.hpp" +#include "kmerdb.hpp" #include "blastdb.hpp" /***********************************************************************/ -Maligner::Maligner(vector temp, int num, int match, int misMatch, float div, int ms, int minCov, string mode) : - db(temp), numWanted(num), matchScore(match), misMatchPenalty(misMatch), minDivR(div), minSimilarity(ms), minCoverage(minCov), searchMethod(mode) {} +Maligner::Maligner(vector temp, int num, int match, int misMatch, float div, int ms, int minCov, string mode, Database* dataLeft, Database* dataRight) : + db(temp), numWanted(num), matchScore(match), misMatchPenalty(misMatch), minDivR(div), minSimilarity(ms), minCoverage(minCov), searchMethod(mode), databaseLeft(dataLeft), databaseRight(dataRight) {} /***********************************************************************/ string Maligner::getResults(Sequence* q, DeCalculator* decalc) { try { @@ -25,15 +25,17 @@ string Maligner::getResults(Sequence* q, DeCalculator* decalc) { string chimera; - if (searchMethod != "blast") { + if (searchMethod == "distance") { //find closest seqs to query in template - returns copies of seqs so trim does not destroy - remember to deallocate - refSeqs = decalc->findClosest(query, db, numWanted); - }else{ - refSeqs = getBlastSeqs(query, numWanted); - } + refSeqs = decalc->findClosest(query, db, numWanted, indexes); + }else if (searchMethod == "blast") { + refSeqs = getBlastSeqs(query, numWanted); //fills indexes + }else if (searchMethod == "kmer") { + refSeqs = getKmerSeqs(query, numWanted); //fills indexes + }else { mothurOut("not valid search."); exit(1); } //should never get here refSeqs = minCoverageFilter(refSeqs); - + if (refSeqs.size() < 2) { for (int i = 0; i < refSeqs.size(); i++) { delete refSeqs[i]; } percentIdenticalQueryChimera = 0.0; @@ -41,13 +43,14 @@ string Maligner::getResults(Sequence* q, DeCalculator* decalc) { } int chimeraPenalty = computeChimeraPenalty(); - + //fills outputResults chimera = chimeraMaligner(chimeraPenalty, decalc); - + //free memory delete query; + for (int i = 0; i < refSeqs.size(); i++) { delete refSeqs[i]; } return chimera; @@ -65,31 +68,31 @@ string Maligner::chimeraMaligner(int chimeraPenalty, DeCalculator* decalc) { //trims seqs to first non gap char in all seqs and last non gap char in all seqs spotMap = decalc->trimSeqs(query, refSeqs); - + vector temp = refSeqs; temp.push_back(query); verticalFilter(temp); +//for (int i = 0; i < temp.size(); i++) { cout << temp[i]->getName() << '\n' << temp[i]->getAligned() << endl; } return "no"; vector< vector > matrix = buildScoreMatrix(query->getAligned().length(), refSeqs.size()); //builds and initializes fillScoreMatrix(matrix, refSeqs, chimeraPenalty); - + vector path = extractHighestPath(matrix); vector trace = mapTraceRegionsToAlignment(path, refSeqs); - + if (trace.size() > 1) { chimera = "yes"; } else { chimera = "no"; } int traceStart = path[0].col; - int traceEnd = path[path.size()-1].col; - + int traceEnd = path[path.size()-1].col; string queryInRange = query->getAligned(); queryInRange = queryInRange.substr(traceStart, (traceEnd-traceStart+1)); string chimeraSeq = constructChimericSeq(trace, refSeqs); - + percentIdenticalQueryChimera = computePercentID(queryInRange, chimeraSeq); //save output results @@ -101,6 +104,7 @@ string Maligner::chimeraMaligner(int chimeraPenalty, DeCalculator* decalc) { results temp; temp.parent = refSeqs[seqIndex]->getName(); + temp.parentAligned = db[indexes[seqIndex]]->getAligned(); temp.nastRegionStart = spotMap[regionStart]; temp.nastRegionEnd = spotMap[regionEnd]; temp.regionStart = regionStart; @@ -481,6 +485,8 @@ float Maligner::computePercentID(string queryAlign, string chimera) { //*************************************************************************************************************** vector Maligner::getBlastSeqs(Sequence* q, int num) { try { + indexes.clear(); + vector refResults; //generate blastdb Database* database = new BlastDB(-2.0, -1.0, matchScore, misMatchPenalty); for (int i = 0; i < db.size(); i++) { database->addSequence(*db[i]); } @@ -498,6 +504,82 @@ vector Maligner::getBlastSeqs(Sequence* q, int num) { vector tempIndexesRight = database->findClosestMegaBlast(queryRight, num+1); vector tempIndexesLeft = database->findClosestMegaBlast(queryLeft, num+1); + //if ((tempIndexesRight.size() != (num+1)) || (tempIndexesLeft.size() != (num+1))) { mothurOut("megablast returned " + toString(tempIndexesRight.size()) + " results for the right end, and " + toString(tempIndexesLeft.size()) + " for the left end. Needed " + toString(num+1) + ". Unable to porcess sequence " + q->getName()); mothurOutEndLine(); return refResults; } + + vector smaller; + vector larger; + + if (tempIndexesRight.size() < tempIndexesLeft.size()) { smaller = tempIndexesRight; larger = tempIndexesLeft; } + else { smaller = tempIndexesLeft; larger = tempIndexesRight; } + + //merge results + map seen; + map::iterator it; + + vector mergedResults; + for (int i = 0; i < smaller.size(); i++) { + //add left if you havent already + it = seen.find(smaller[i]); + if (it == seen.end()) { + mergedResults.push_back(smaller[i]); + seen[smaller[i]] = smaller[i]; + } + + //add right if you havent already + it = seen.find(larger[i]); + if (it == seen.end()) { + mergedResults.push_back(larger[i]); + seen[larger[i]] = larger[i]; + } + } + + for (int i = smaller.size(); i < larger.size(); i++) { + it = seen.find(larger[i]); + if (it == seen.end()) { + mergedResults.push_back(larger[i]); + seen[larger[i]] = larger[i]; + } + } + + if (mergedResults.size() < numWanted) { numWanted = mergedResults.size(); } +//cout << q->getName() << endl; + for (int i = 0; i < numWanted; i++) { +//cout << db[mergedResults[i]]->getName() << endl; + if (db[mergedResults[i]]->getName() != q->getName()) { + Sequence* temp = new Sequence(db[mergedResults[i]]->getName(), db[mergedResults[i]]->getAligned()); + refResults.push_back(temp); + indexes.push_back(mergedResults[i]); + } +//cout << mergedResults[i] << endl; + } +//cout << endl; + delete queryRight; + delete queryLeft; + delete database; + + return refResults; + } + catch(exception& e) { + errorOut(e, "Maligner", "getBlastSeqs"); + exit(1); + } +} +//*************************************************************************************************************** +vector Maligner::getKmerSeqs(Sequence* q, int num) { + try { + indexes.clear(); + + //get parts of query + string queryUnAligned = q->getUnaligned(); + string leftQuery = queryUnAligned.substr(0, int(queryUnAligned.length() * 0.33)); //first 1/3 of the sequence + string rightQuery = queryUnAligned.substr(int(queryUnAligned.length() * 0.66)); //last 1/3 of the sequence + + Sequence* queryLeft = new Sequence(q->getName(), leftQuery); + Sequence* queryRight = new Sequence(q->getName(), rightQuery); + + vector tempIndexesLeft = databaseLeft->findClosestSequences(queryLeft, numWanted); + vector tempIndexesRight = databaseRight->findClosestSequences(queryRight, numWanted); + //merge results map seen; map::iterator it; @@ -519,16 +601,17 @@ vector Maligner::getBlastSeqs(Sequence* q, int num) { } } - +//cout << q->getName() << endl; vector refResults; for (int i = 0; i < numWanted; i++) { +//cout << db[mergedResults[i]]->getName() << endl; Sequence* temp = new Sequence(db[mergedResults[i]]->getName(), db[mergedResults[i]]->getAligned()); refResults.push_back(temp); + indexes.push_back(mergedResults[i]); } - +//cout << endl; delete queryRight; delete queryLeft; - delete database; return refResults; } @@ -537,6 +620,5 @@ vector Maligner::getBlastSeqs(Sequence* q, int num) { exit(1); } } - //*************************************************************************************************************** diff --git a/maligner.h b/maligner.h index abe18a3..13aa917 100644 --- a/maligner.h +++ b/maligner.h @@ -11,6 +11,7 @@ #include "decalc.h" #include "chimera.h" +#include "database.hpp" /***********************************************************************/ //This class was modeled after the chimeraMaligner written by the Broad Institute @@ -19,7 +20,7 @@ class Maligner { public: - Maligner(vector, int, int, int, float, int, int, string); + Maligner(vector, int, int, int, float, int, int, string, Database*, Database*); ~Maligner() {}; string getResults(Sequence*, DeCalculator*); @@ -35,7 +36,10 @@ class Maligner { string searchMethod; float minDivR, percentIdenticalQueryChimera; vector outputResults; + vector indexes; //stores index into template seqs of the refSeqs, so we can return the whole sequence rather than the trimmed and filtered one map spotMap; + Database* databaseLeft; + Database* databaseRight; vector minCoverageFilter(vector); //removes top matches that do not have minimum coverage with query. int computeChimeraPenalty(); @@ -49,6 +53,7 @@ class Maligner { float computePercentID(string, string); string chimeraMaligner(int, DeCalculator*); vector getBlastSeqs(Sequence*, int); + vector getKmerSeqs(Sequence*, int); }; diff --git a/mothur.cpp b/mothur.cpp index 8203c57..7acf389 100644 --- a/mothur.cpp +++ b/mothur.cpp @@ -93,12 +93,21 @@ int main(int argc, char *argv[]){ input = argv[1]; if (input[0] == '#') { + mothurOutJustToLog("Script Mode"); + mothurOutEndLine(); mothurOutEndLine(); + mothur = new ScriptEngine(argv[0], argv[1]); }else{ + mothurOutJustToLog("Batch Mode"); + mothurOutEndLine(); mothurOutEndLine(); + mothur = new BatchEngine(argv[0], argv[1]); } } else{ + mothurOutJustToLog("Interactive Mode"); + mothurOutEndLine(); mothurOutEndLine(); + mothur = new InteractEngine(argv[0]); } diff --git a/pintail.cpp b/pintail.cpp index 74e1ce4..46da913 100644 --- a/pintail.cpp +++ b/pintail.cpp @@ -39,7 +39,8 @@ Pintail::~Pintail() { //*************************************************************************************************************** void Pintail::doPrep() { try { - + + mergedFilterString = ""; windowSizesTemplate.resize(templateSeqs.size(), window); quantiles.resize(100); //one for every percent mismatch quantilesMembers.resize(100); //one for every percent mismatch @@ -72,14 +73,15 @@ void Pintail::doPrep() { mothurOut("Done."); mothurOutEndLine(); }else { probabilityProfile = readFreq(); mothurOut("Done."); } mothurOutEndLine(); - - //quantiles are used to determine whether the de values found indicate a chimera - //if you have to calculate them, its time intensive because you are finding the de and deviation values for each - //combination of sequences in the template - if (quanfile != "") { quantiles = readQuantiles(); } - else { + + //make P into Q + for (int i = 0; i < probabilityProfile.size(); i++) { probabilityProfile[i] = 1 - probabilityProfile[i]; } //cout << i << '\t' << probabilityProfile[i] << endl; + + bool reRead = false; + //create filter if needed for later + if (filter) { + reRead = true; - //if user has not provided the quantiles and wants the mask we need to mask, but then delete and reread or we will mess up the best match process in getChimeras if (seqMask != "") { //mask templates for (int i = 0; i < templateSeqs.size(); i++) { @@ -87,11 +89,44 @@ void Pintail::doPrep() { } } - if (filter) { - createFilter(templateSeqs); - for (int i = 0; i < templateSeqs.size(); i++) { runFilter(templateSeqs[i]); } + //read in all query seqs + ifstream in; + openInputFile(fastafile, in); + + vector tempQuerySeqs; + while(!in.eof()){ + Sequence* s = new Sequence(in); + gobble(in); + + if (s->getName() != "") { tempQuerySeqs.push_back(s); } + } + in.close(); + + vector temp; + //merge query seqs and template seqs + temp = templateSeqs; + for (int i = 0; i < tempQuerySeqs.size(); i++) { temp.push_back(tempQuerySeqs[i]); } + + mergedFilterString = createFilter(temp, 0.5); + + //reread template seqs + for (int i = 0; i < tempQuerySeqs.size(); i++) { delete tempQuerySeqs[i]; } + } + + + //quantiles are used to determine whether the de values found indicate a chimera + //if you have to calculate them, its time intensive because you are finding the de and deviation values for each + //combination of sequences in the template + if (quanfile != "") { + quantiles = readQuantiles(); + }else { + if ((!filter) && (seqMask != "")) { //if you didn't filter but you want to mask. if you filtered then you did mask first above. + reRead = true; + //mask templates + for (int i = 0; i < templateSeqs.size(); i++) { + decalc->runMask(templateSeqs[i]); + } } - mothurOut("Calculating quantiles for your template. This can take a while... I will output the quantiles to a .quan file that you can input them using the quantiles parameter next time you run this command. Providing the .quan file will dramatically improve speed. "); cout.flush(); if (processors == 1) { @@ -155,15 +190,15 @@ void Pintail::doPrep() { } mothurOut("Done."); mothurOutEndLine(); - - //if mask reread template - if ((seqMask != "") || (filter)) { - for (int i = 0; i < templateSeqs.size(); i++) { delete templateSeqs[i]; } - templateSeqs = readSeqs(templateFileName); - } - } + if (reRead) { + for (int i = 0; i < templateSeqs.size(); i++) { delete templateSeqs[i]; } + templateSeqs.clear(); + templateSeqs = readSeqs(templateFileName); + } + + //free memory for (int i = 0; i < templateLines.size(); i++) { delete templateLines[i]; } @@ -217,24 +252,22 @@ int Pintail::getChimeras(Sequence* query) { querySeq = query; trimmed.clear(); windowSizes = window; - - //find pairs - bestfit = findPairs(query); - //make P into Q - for (int i = 0; i < probabilityProfile.size(); i++) { probabilityProfile[i] = 1 - probabilityProfile[i]; } //cout << i << '\t' << probabilityProfile[i] << endl; - - //mask sequences if the user wants to + //find pairs has to be done before a mask + bestfit = findPairs(query); + + //if they mask if (seqMask != "") { decalc->runMask(query); decalc->runMask(bestfit); } - - if (filter) { + + if (filter) { //must be done after a mask runFilter(query); runFilter(bestfit); } + //trim seq decalc->trimSeqs(query, bestfit, trimmed); @@ -319,9 +352,7 @@ Sequence* Pintail::findPairs(Sequence* q) { Sequence* seqsMatches; - vector copy = decalc->findClosest(q, templateSeqs, 1); - seqsMatches = copy[0]; - + seqsMatches = decalc->findClosest(q, templateSeqs); return seqsMatches; } diff --git a/pintail.h b/pintail.h index ed52c3a..efb8ba6 100644 --- a/pintail.h +++ b/pintail.h @@ -65,6 +65,7 @@ class Pintail : public Chimera { vector< vector > quantiles; //quantiles[0] is the vector of deviations with ceiling score of 1, quantiles[1] is the vector of deviations with ceiling score of 2... vector< vector > quantilesMembers; //quantiles[0] is the vector of deviations with ceiling score of 1, quantiles[1] is the vector of deviations with ceiling score of 2... set h; + string mergedFilterString; vector readFreq(); diff --git a/removeseqscommand.cpp b/removeseqscommand.cpp index a8f03e0..19a4bc0 100644 --- a/removeseqscommand.cpp +++ b/removeseqscommand.cpp @@ -22,7 +22,7 @@ RemoveSeqsCommand::RemoveSeqsCommand(string option){ else { //valid paramters for this command - string Array[] = {"fasta","name", "group", "alignreport", "accnos", "list","outputdir","inputdir" }; + string Array[] = {"fasta","name", "group", "alignreport", "accnos", "list","outputdir","inputdir", "dups" }; vector myArray (Array, Array+(sizeof(Array)/sizeof(string))); OptionParser parser(option); @@ -119,10 +119,17 @@ RemoveSeqsCommand::RemoveSeqsCommand(string option){ if (listfile == "not open") { abort = true; } else if (listfile == "not found") { listfile = ""; } + string usedDups = "true"; + string temp = validParameter.validFile(parameters, "dups", false); if (temp == "not found") { temp = "false"; usedDups = ""; } + dups = isTrue(temp); + if ((fastafile == "") && (namefile == "") && (groupfile == "") && (alignfile == "") && (listfile == "")) { mothurOut("You must provide one of the following: fasta, name, group, alignreport or list."); mothurOutEndLine(); abort = true; } int okay = 2; if (outputDir != "") { okay++; } + if (usedDups != "") { okay++; } + + if ((usedDups != "") && (namefile == "")) { mothurOut("You may only use dups with the name option."); mothurOutEndLine(); abort = true; } if (parameters.size() > okay) { mothurOut("You may only enter one of the following: fasta, name, group, alignreport, or list."); mothurOutEndLine(); abort = true; } } @@ -139,7 +146,8 @@ void RemoveSeqsCommand::help(){ try { mothurOut("The remove.seqs command reads an .accnos file and one of the following file types: fasta, name, group, list or alignreport file.\n"); mothurOut("It outputs a file containing the sequences NOT in the .accnos file.\n"); - mothurOut("The remove.seqs command parameters are accnos, fasta, name, group, list and alignreport. You must provide accnos and one of the other parameters.\n"); + mothurOut("The remove.seqs command parameters are accnos, fasta, name, group, list, alignreport and dups. You must provide accnos and one of the file parameters.\n"); + mothurOut("The dups parameter allows you to remove the entire line from a name file if you remove any name from the line. default=false. If dups=true, then remove.seqs outputs a new .accnos file containing all the sequences removed. \n"); mothurOut("The remove.seqs command should be in the following format: remove.seqs(accnos=yourAccnos, fasta=yourFasta).\n"); mothurOut("Example remove.seqs(accnos=amazon.accnos, fasta=amazon.fasta).\n"); mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n\n"); @@ -288,7 +296,12 @@ void RemoveSeqsCommand::readName(){ try { if (outputDir == "") { outputDir += hasPath(namefile); } string outputFileName = getRootName(namefile) + "pick" + getExtension(namefile); + string outputFileName2 = getRootName(namefile) + "dups.accnos"; + ofstream out2; + if (dups) { openOutputFile(outputFileName2, out2); } + bool wroteDups = false; + ofstream out; openOutputFile(outputFileName, out); @@ -322,38 +335,47 @@ void RemoveSeqsCommand::readName(){ } } - //if the name in the first column is in the set then print it and any other names in second column also in set - if (names.count(firstCol) == 0) { - - wroteSomething = true; - - out << firstCol << '\t'; - - //you know you have at least one valid second since first column is valid - for (int i = 0; i < validSecond.size()-1; i++) { out << validSecond[i] << ','; } - out << validSecond[validSecond.size()-1] << endl; - - //make first name in set you come to first column and then add the remaining names to second column + if ((dups) && (validSecond.size() != parsedNames.size())) { + wroteDups = true; + for (int i = 0; i < parsedNames.size(); i++) { out2 << parsedNames[i] << endl; } }else { - - //you want part of this row - if (validSecond.size() != 0) { - + //if the name in the first column is in the set then print it and any other names in second column also in set + if (names.count(firstCol) == 0) { + wroteSomething = true; - out << validSecond[0] << '\t'; - + out << firstCol << '\t'; + //you know you have at least one valid second since first column is valid for (int i = 0; i < validSecond.size()-1; i++) { out << validSecond[i] << ','; } out << validSecond[validSecond.size()-1] << endl; + + //make first name in set you come to first column and then add the remaining names to second column + }else { + + //you want part of this row + if (validSecond.size() != 0) { + + wroteSomething = true; + + out << validSecond[0] << '\t'; + + //you know you have at least one valid second since first column is valid + for (int i = 0; i < validSecond.size()-1; i++) { out << validSecond[i] << ','; } + out << validSecond[validSecond.size()-1] << endl; + } } } - gobble(in); } in.close(); out.close(); + if (dups) { out2.close(); } + if (wroteDups == false) { + remove(outputFileName2.c_str()); + } + if (wroteSomething == false) { mothurOut("Your file contains only sequences from the .accnos file."); mothurOutEndLine(); remove(outputFileName.c_str()); diff --git a/removeseqscommand.h b/removeseqscommand.h index 1dcb0fc..647cb76 100644 --- a/removeseqscommand.h +++ b/removeseqscommand.h @@ -24,7 +24,7 @@ class RemoveSeqsCommand : public Command { private: set names; string accnosfile, fastafile, namefile, groupfile, alignfile, listfile, outputDir; - bool abort; + bool abort, dups; void readFasta(); void readName(); diff --git a/sabundvector.cpp b/sabundvector.cpp index b70c642..2bb8d69 100644 --- a/sabundvector.cpp +++ b/sabundvector.cpp @@ -60,8 +60,10 @@ SAbundVector::SAbundVector(ifstream& f): DataVector(), maxRank(0), numBins(0), n for(int i=1;i<=hold;i++){ f >> inputData; + set(i, inputData); } + } catch(exception& e) { errorOut(e, "SAbundVector", "SAbundVector"); @@ -192,13 +194,13 @@ RAbundVector SAbundVector::getRAbundVector(){ try { RAbundVector rav; - for(int i=1;i<=data.size();i++){ + for(int i=1;i < data.size();i++){ for(int j=0;j