saveCol = smallCol;
}
- float oldColCell = colCell->dist;
-
colCell->dist = (colBin * colCell->dist + rowBin * rowCell->dist) / totalBin;
- //warn user if merge with value above cutoff produces a value below cutoff
- if ((colCell->dist < cutoff) && ((oldColCell > cutoff) || (rowCell->dist > cutoff)) ) {
- mothurOut("Warning: merging " + toString(oldColCell) + " with " + toString(rowCell->dist) + ", new value = " + toString(colCell->dist) + ". Results will differ from those if cutoff was used in the read.dist command."); mothurOutEndLine();
- }
-
return(true);
}
catch(exception& e) {
system(blastCommand.c_str());
ifstream m8FileHandle;
- openInputFile(blastFileName, m8FileHandle);
+ openInputFile(blastFileName, m8FileHandle, "no error");
string dummy;
int templateAccession;
gobble(m8FileHandle);
topMatches.push_back(templateAccession);
+//cout << templateAccession << endl;
}
m8FileHandle.close();
-
+//cout << "\n\n" ;
return topMatches;
}
catch(exception& e) {
out2 << it->first << '\t' << it->second << endl;
}
out2.close();
-
- out.setf(ios::fixed, ios::floatfield); out.setf(ios::showpoint);
-
out << querySeq->getName() << endl << endl << "Reference sequences used and distance to query:" << endl;
for (int j = 0; j < closest.size(); j++) {
- out << setprecision(3) << closest[j].seq->getName() << '\t' << closest[j].dist << endl;
+ out << closest[j].seq->getName() << '\t' << closest[j].dist << endl;
}
out << endl << endl;
out << endl << "Window\tStartPos\tEndPos" << endl;
it = trim.begin();
-
for (int k = 0; k < windows.size()-1; k++) {
out << k+1 << '\t' << spotMap[windows[k]-it->first] << '\t' << spotMap[windows[k]-it->first+windowSizes] << endl;
}
out << windows.size() << '\t' << spotMap[windows[windows.size()-1]-it->first] << '\t' << spotMap[it->second-it->first-1] << endl;
out << endl;
-
out << "Window\tAvgQ\t(sdQ)\tAvgR\t(sdR)\tRatio\tAnova" << endl;
for (int k = 0; k < windows.size(); k++) {
float ds = averageQuery[k] / averageRef[k];
//float fs = varQuery[query] / varRef[query]; /* F-Snedecor, test for differences of variances */
bool results = false;
-
+
//confidence limit, t - Student, anova
out << "Window\tConfidenceLimit\tt-Student\tAnova" << endl;
if (results) {
mothurOut(querySeq->getName() + " was found have at least one chimeric window."); mothurOutEndLine();
}
+
+ //free memory
+ for (int i = 0; i < closest.size(); i++) { delete closest[i].seq; }
+
}
catch(exception& e) {
for (int i = 0; i < closest.size(); i++) { temp.push_back(closest[i].seq); }
temp.push_back(query);
- createFilter(temp);
+ createFilter(temp, 0.5);
for (int i = 0; i < temp.size(); i++) { runFilter(temp[i]); }
determineChimeras(); //fills anova, isChimericConfidence, isChimericTStudent and isChimericANOVA.
- //free memory
- for (int i = 0; i < closest.size(); i++) { delete closest[i].seq; }
-
return 0;
}
catch(exception& e) {
}
sort(topMatches.begin(), topMatches.end(), compareSeqDist);
+
+ for (int i = numWanted; i < topMatches.size(); i++) { delete topMatches[i].seq; }
+
+ topMatches.resize(numWanted);
return topMatches;
//***************************************************************************************************************
//this is a vertical soft filter
-void Chimera::createFilter(vector<Sequence*> seqs) {
+string Chimera::createFilter(vector<Sequence*> seqs, float t) {
try {
filterString = "";
- int threshold = int (0.5 * seqs.size());
+ int threshold = int (t * seqs.size());
//cout << "threshhold = " << threshold << endl;
vector<int> gaps; gaps.resize(seqs[0]->getAligned().length(), 0);
//cout << "filter = " << filterString << endl;
mothurOut("Filter removed " + toString(numColRemoved) + " columns."); mothurOutEndLine();
+ return filterString;
}
catch(exception& e) {
errorOut(e, "Chimera", "createFilter");
}
}
//***************************************************************************************************************
-void Chimera::runFilter(Sequence* seq) {
+map<int, int> Chimera::runFilter(Sequence* seq) {
try {
-
+ map<int, int> maskMap;
string seqAligned = seq->getAligned();
string newAligned = "";
+ int count = 0;
for (int j = 0; j < seqAligned.length(); j++) {
//if this spot is a gap
- if (filterString[j] == '1') { newAligned += seqAligned[j]; }
+ if (filterString[j] == '1') {
+ newAligned += seqAligned[j];
+ maskMap[count] = j;
+ count++;
+ }
}
seq->setAligned(newAligned);
+
+ return maskMap;
}
catch(exception& e) {
errorOut(e, "Chimera", "runFilter");
int nastRegionStart;
int nastRegionEnd;
string parent;
+ string parentAligned;
float queryToParent;
float queryToParentLocal;
float divR;
struct SeqDist {
Sequence* seq;
float dist;
+ int index;
};
//********************************************************************************************************************
//sorts lowest to highest
Chimera(){};
Chimera(string);
- Chimera(string, bool);
+ Chimera(string, bool, string);
Chimera(string, string);
virtual ~Chimera(){};
virtual void setFilter(bool f) { filter = f; }
virtual vector<Sequence*> readSeqs(string);
virtual vector< vector<float> > readQuantiles();
virtual void setMask(string);
- virtual void runFilter(Sequence*);
- virtual void createFilter(vector<Sequence*>);
+ virtual map<int, int> runFilter(Sequence*);
+ virtual string createFilter(vector<Sequence*>, float);
virtual void printHeader(ostream&){};
virtual int getChimeras(Sequence*){ return 0; }
temp = validParameter.validFile(parameters, "realign", false); if (temp == "not found") { temp = "f"; }
realign = isTrue(temp);
- search = validParameter.validFile(parameters, "search", false); if (search == "not found") { temp = "distance"; }
+ search = validParameter.validFile(parameters, "search", false); if (search == "not found") { search = "distance"; }
temp = validParameter.validFile(parameters, "iters", false);
if ((temp == "not found") && (method == "chimeraslayer")) { temp = "100"; }
else if (temp == "not found") { temp = "20"; }
convert(temp, numwanted);
-
+ if ((search != "distance") && (search != "blast") && (search != "kmer")) { mothurOut(search + " is not a valid search."); mothurOutEndLine(); abort = true; }
if (((method != "bellerophon")) && (templatefile == "")) { mothurOut("You must provide a template file with the pintail, ccode, chimeraslayer or chimeracheck methods."); mothurOutEndLine(); abort = true; }
mothurOut("The mincov parameter allows you to specify minimum coverage by closest matches found in template. Default is 70, meaning 70%. \n");
mothurOut("The minbs parameter allows you to specify minimum bootstrap support for calling a sequence chimeric. Default is 90, meaning 90%. \n");
mothurOut("The minsnp parameter allows you to specify percent of SNPs to sample on each side of breakpoint for computing bootstrap support (default: 10) \n");
- mothurOut("The search parameter allows you to specify search method for finding the closest parent. Choices are distance and blast, default distance. -used only by chimeraslayer. \n");
+ mothurOut("The search parameter allows you to specify search method for finding the closest parent. Choices are distance, blast, and kmer, default distance. -used only by chimeraslayer. \n");
mothurOut("The realign parameter allows you to realign the query to the potential paretns. Choices are true or false, default false. -used only by chimeraslayer. \n");
mothurOut("NOT ALL PARAMETERS ARE USED BY ALL METHODS. Please look below for method specifics.\n\n");
mothurOut("Details for each method: \n");
else if (method == "pintail") { chimera = new Pintail(fastafile, outputDir); }
else if (method == "ccode") { chimera = new Ccode(fastafile, outputDir); }
else if (method == "chimeracheck") { chimera = new ChimeraCheckRDP(fastafile, outputDir); }
- else if (method == "chimeraslayer") { chimera = new ChimeraSlayer(search, realign); }
+ else if (method == "chimeraslayer") { chimera = new ChimeraSlayer(search, realign, fastafile); }
else { mothurOut("Not a valid method."); mothurOutEndLine(); return 0; }
//set user options
// else { fastafile = outputDir + getRootName(getSimpleName(fastafile)) + "filter.fasta"; }
appendOutputFiles(tempHeader, outputFileName);
- remove(tempHeader.c_str());
+ remove(outputFileName.c_str());
+ rename(tempHeader.c_str(), outputFileName.c_str());
for (int i = 0; i < templateSeqs.size(); i++) { delete templateSeqs[i]; }
#include "chimeraslayer.h"
#include "chimerarealigner.h"
+#include "kmerdb.hpp"
//***************************************************************************************************************
-ChimeraSlayer::ChimeraSlayer(string mode, bool r) : searchMethod(mode), realign(r) { decalc = new DeCalculator(); }
+ChimeraSlayer::ChimeraSlayer(string mode, bool r, string f) : searchMethod(mode), realign(r), fastafile(f) {
+ decalc = new DeCalculator();
+}
//***************************************************************************************************************
-ChimeraSlayer::~ChimeraSlayer() { delete decalc; }
+void ChimeraSlayer::doPrep() {
+ try {
+
+ string kmerDBNameLeft;
+ string kmerDBNameRight;
+
+ //generate the kmerdb to pass to maligner
+ if (searchMethod == "kmer") {
+ //leftside
+ string leftTemplateFileName = "left." + templateFileName;
+ databaseLeft = new KmerDB(leftTemplateFileName, kmerSize);
+ kmerDBNameLeft = leftTemplateFileName.substr(0,leftTemplateFileName.find_last_of(".")+1) + char('0'+ kmerSize) + "mer";
+ ifstream kmerFileTestLeft(kmerDBNameLeft.c_str());
+
+ if(!kmerFileTestLeft){
+
+ for (int i = 0; i < templateSeqs.size(); i++) {
+ string leftFrag = templateSeqs[i]->getUnaligned();
+ leftFrag = leftFrag.substr(0, int(leftFrag.length() * 0.33));
+
+ Sequence leftTemp(templateSeqs[i]->getName(), leftFrag);
+ databaseLeft->addSequence(leftTemp);
+ }
+ databaseLeft->generateDB();
+
+ }else {
+ databaseLeft->readKmerDB(kmerFileTestLeft);
+ }
+ kmerFileTestLeft.close();
+
+ databaseLeft->setNumSeqs(templateSeqs.size());
+
+ //rightside
+ string rightTemplateFileName = "right." + templateFileName;
+ databaseRight = new KmerDB(rightTemplateFileName, kmerSize);
+ kmerDBNameRight = rightTemplateFileName.substr(0,rightTemplateFileName.find_last_of(".")+1) + char('0'+ kmerSize) + "mer";
+ ifstream kmerFileTestRight(kmerDBNameRight.c_str());
+
+ if(!kmerFileTestRight){
+
+ for (int i = 0; i < templateSeqs.size(); i++) {
+ string rightFrag = templateSeqs[i]->getUnaligned();
+ rightFrag = rightFrag.substr(int(rightFrag.length() * 0.66));
+
+ Sequence rightTemp(templateSeqs[i]->getName(), rightFrag);
+ databaseRight->addSequence(rightTemp);
+ }
+ databaseRight->generateDB();
+
+ }else {
+ databaseRight->readKmerDB(kmerFileTestRight);
+ }
+ kmerFileTestRight.close();
+
+ databaseRight->setNumSeqs(templateSeqs.size());
+
+ }
+
+ int start = time(NULL);
+ //filter the sequences
+ //read in all query seqs
+ ifstream in;
+ openInputFile(fastafile, in);
+
+ vector<Sequence*> tempQuerySeqs;
+ while(!in.eof()){
+ Sequence* s = new Sequence(in);
+ gobble(in);
+
+ if (s->getName() != "") { tempQuerySeqs.push_back(s); }
+ }
+ in.close();
+
+ vector<Sequence*> temp = templateSeqs;
+ for (int i = 0; i < tempQuerySeqs.size(); i++) { temp.push_back(tempQuerySeqs[i]); }
+
+ createFilter(temp, 0.0); //just removed columns where all seqs have a gap
+
+ for (int i = 0; i < tempQuerySeqs.size(); i++) { delete tempQuerySeqs[i]; }
+
+ //run filter on template
+ for (int i = 0; i < templateSeqs.size(); i++) { runFilter(templateSeqs[i]); }
+
+ mothurOutEndLine(); mothurOut("It took " + toString(time(NULL) - start) + " secs to filter."); mothurOutEndLine();
+
+ }
+ catch(exception& e) {
+ errorOut(e, "ChimeraSlayer", "doprep");
+ exit(1);
+ }
+}
+//***************************************************************************************************************
+ChimeraSlayer::~ChimeraSlayer() { delete decalc; if (searchMethod == "kmer") { delete databaseRight; delete databaseLeft; } }
//***************************************************************************************************************
void ChimeraSlayer::printHeader(ostream& out) {
mothurOutEndLine();
- mothurOut("Only reporting sequence supported by 90% of bootstrapped results.");
+ mothurOut("Only reporting sequence supported by " + toString(minBS) + "% of bootstrapped results.");
mothurOutEndLine();
+
+ out << "Name\tLeftParent\tRightParent\tDivQLAQRB\tPerIDQLAQRB\tBootStrapA\tDivQLBQRA\tPerIDQLBQRA\tBootStrapB\tFlag\tLeftWindow\tRightWindow\n";
}
//***************************************************************************************************************
void ChimeraSlayer::print(ostream& out) {
mothurOut(querySeq->getName() + "\tyes"); mothurOutEndLine();
}
}
- out << querySeq->getName() << "\tyes" << endl;
+
printBlock(chimeraResults[0], out);
out << endl;
}else { out << querySeq->getName() << "\tno" << endl; }
int ChimeraSlayer::getChimeras(Sequence* query) {
try {
chimeraFlags = "no";
- querySeq = query;
- for (int i = 0; i < query->getAligned().length(); i++) {
- spotMap[i] = i;
- }
+ //filter query
+ spotMap = runFilter(query);
+
+ querySeq = query;
//referenceSeqs, numWanted, matchScore, misMatchPenalty, divR, minSimilarity
- maligner = new Maligner(templateSeqs, numWanted, match, misMatch, divR, minSim, minCov, searchMethod);
+ maligner = new Maligner(templateSeqs, numWanted, match, misMatch, divR, minSim, minCov, searchMethod, databaseLeft, databaseRight);
slayer = new Slayer(window, increment, minSim, divR, iters, minSNP);
string chimeraFlag = maligner->getResults(query, decalc);
vector<results> Results = maligner->getOutput();
-
- //realign query to parents to improve slayers detection rate???
+
+ //found in testing realigning only made things worse
if (realign) {
ChimeraReAligner realigner(templateSeqs, match, misMatch);
realigner.reAlign(query, Results);
}
- //if (chimeraFlag == "yes") {
-
- //get sequence that were given from maligner results
- vector<SeqDist> seqs;
- map<string, float> removeDups;
- map<string, float>::iterator itDup;
- for (int j = 0; j < Results.size(); j++) {
- float dist = (Results[j].regionEnd - Results[j].regionStart + 1) * Results[j].queryToParentLocal;
- //only add if you are not a duplicate
- itDup = removeDups.find(Results[j].parent);
- if (itDup == removeDups.end()) { //this is not duplicate
- removeDups[Results[j].parent] = dist;
- }else if (dist > itDup->second) { //is this a stronger number for this parent
- removeDups[Results[j].parent] = dist;
- }
- }
-
- for (itDup = removeDups.begin(); itDup != removeDups.end(); itDup++) {
- Sequence* seq = getSequence(itDup->first); //makes copy so you can filter and mask and not effect template
-
- SeqDist member;
- member.seq = seq;
- member.dist = itDup->second;
+ if (chimeraFlag == "yes") {
- seqs.push_back(member);
- }
-
- //limit number of parents to explore - default 3
- if (Results.size() > parents) {
- //sort by distance
- sort(seqs.begin(), seqs.end(), compareSeqDist);
- //prioritize larger more similiar sequence fragments
- reverse(seqs.begin(), seqs.end());
+ //get sequence that were given from maligner results
+ vector<SeqDist> seqs;
+ map<string, float> removeDups;
+ map<string, float>::iterator itDup;
+ map<string, string> parentNameSeq;
+ map<string, string>::iterator itSeq;
+ for (int j = 0; j < Results.size(); j++) {
+ float dist = (Results[j].regionEnd - Results[j].regionStart + 1) * Results[j].queryToParentLocal;
+ //only add if you are not a duplicate
+ itDup = removeDups.find(Results[j].parent);
+ if (itDup == removeDups.end()) { //this is not duplicate
+ removeDups[Results[j].parent] = dist;
+ parentNameSeq[Results[j].parent] = Results[j].parentAligned;
+ }else if (dist > itDup->second) { //is this a stronger number for this parent
+ removeDups[Results[j].parent] = dist;
+ parentNameSeq[Results[j].parent] = Results[j].parentAligned;
+ }
+ }
- for (int k = seqs.size()-1; k > (parents-1); k--) {
- delete seqs[k].seq;
- seqs.pop_back();
+ for (itDup = removeDups.begin(); itDup != removeDups.end(); itDup++) {
+ //Sequence* seq = getSequence(itDup->first); //makes copy so you can filter and mask and not effect template
+ itSeq = parentNameSeq.find(itDup->first);
+//cout << itDup->first << itSeq->second << endl;
+ Sequence* seq = new Sequence(itDup->first, itSeq->second);
+
+ SeqDist member;
+ member.seq = seq;
+ member.dist = itDup->second;
+
+ seqs.push_back(member);
}
- }
-
- //put seqs into vector to send to slayer
- vector<Sequence*> seqsForSlayer;
- for (int k = 0; k < seqs.size(); k++) { seqsForSlayer.push_back(seqs[k].seq); }
- //cout << i+1 << "num parents = " << seqsForSlayer.size() << '\t' << chimeraFlag << endl;
-//ofstream out;
-//string name = querySeqs[i]->getName();
-//cout << name << endl;
-//name = name.substr(name.find_first_of("|")+1);
-//cout << name << endl;
-//name = name.substr(name.find_first_of("|")+1);
-//cout << name << endl;
-//name = name.substr(0, name.find_last_of("|"));
-//cout << name << endl;
-//string filename = toString(i+1) + ".seqsforslayer";
-//openOutputFile(filename, out);
-//cout << querySeqs[i]->getName() << endl;
-//for (int u = 0; u < seqsForSlayer.size(); u++) { cout << seqsForSlayer[u]->getName() << '\t'; seqsForSlayer[u]->printSequence(out); }
-//cout << endl;
-//out.close();
-//filename = toString(i+1) + ".fasta";
-//openOutputFile(filename, out);
-//querySeqs[i]->printSequence(out);
-//out.close();
-
-
- //mask then send to slayer...
- if (seqMask != "") {
- decalc->setMask(seqMask);
+ //limit number of parents to explore - default 3
+ if (Results.size() > parents) {
+ //sort by distance
+ sort(seqs.begin(), seqs.end(), compareSeqDist);
+ //prioritize larger more similiar sequence fragments
+ reverse(seqs.begin(), seqs.end());
+
+ for (int k = seqs.size()-1; k > (parents-1); k--) {
+ delete seqs[k].seq;
+ seqs.pop_back();
+ }
+ }
- //mask querys
- decalc->runMask(query);
+ //put seqs into vector to send to slayer
+ vector<Sequence*> seqsForSlayer;
+ for (int k = 0; k < seqs.size(); k++) { seqsForSlayer.push_back(seqs[k].seq); }
- //mask parents
- for (int k = 0; k < seqsForSlayer.size(); k++) {
- decalc->runMask(seqsForSlayer[k]);
+ //mask then send to slayer...
+ if (seqMask != "") {
+ decalc->setMask(seqMask);
+
+ //mask querys
+ decalc->runMask(query);
+
+ //mask parents
+ for (int k = 0; k < seqsForSlayer.size(); k++) {
+ decalc->runMask(seqsForSlayer[k]);
+ }
+
+ spotMap = decalc->getMaskMap();
}
- spotMap = decalc->getMaskMap();
+ //send to slayer
+ chimeraFlags = slayer->getResults(query, seqsForSlayer);
+ chimeraResults = slayer->getOutput();
+
+ //free memory
+ for (int k = 0; k < seqs.size(); k++) { delete seqs[k].seq; }
}
- //send to slayer
- chimeraFlags = slayer->getResults(query, seqsForSlayer);
- chimeraResults = slayer->getOutput();
-
- //free memory
- for (int k = 0; k < seqs.size(); k++) { delete seqs[k].seq; }
- //}
-
+ delete maligner;
+ delete slayer;
+
return 0;
}
catch(exception& e) {
//***************************************************************************************************************
void ChimeraSlayer::printBlock(data_struct data, ostream& out){
try {
+ //out << "Name\tParentA\tParentB\tDivQLAQRB\tPerIDQLAQRB\tBootStrapA\tDivQLBQRA\tPerIDQLBQRA\tBootStrapB\tFlag\tLeftWindow\tRightWindow\n";
+
+ out << querySeq->getName() << '\t';
+ out << data.parentA.getName() << "\t" << data.parentB.getName() << '\t';
+ //out << "Left Window: " << spotMap[data.winLStart] << " " << spotMap[data.winLEnd] << endl;
+ //out << "Right Window: " << spotMap[data.winRStart] << " " << spotMap[data.winREnd] << endl;
+
+ out << data.divr_qla_qrb << '\t' << data.qla_qrb << '\t' << data.bsa << '\t';
+ out << data.divr_qlb_qra << '\t' << data.qlb_qra << '\t' << data.bsb << '\t';
- out << "parentA: " << data.parentA.getName() << " parentB: " << data.parentB.getName() << endl;
- out << "Left Window: " << spotMap[data.winLStart] << " " << spotMap[data.winLEnd] << endl;
- out << "Right Window: " << spotMap[data.winRStart] << " " << spotMap[data.winREnd] << endl;
+ out << "yes\t" << spotMap[data.winLStart] << "-" << spotMap[data.winLEnd] << '\t' << spotMap[data.winRStart] << "-" << spotMap[data.winREnd] << '\t';
- out << "Divergence of Query to Leftside ParentA and Rightside ParentB: " << data.divr_qla_qrb << '\t' << "Bootstrap: " << data.bsa << endl;
- out << "Divergence of Query to Rightside ParentA and Leftside ParentB: " << data.divr_qlb_qra << '\t' << "Bootstrap: " << data.bsb << endl;
+ //out << "Similarity of parents: " << data.ab << endl;
+ //out << "Similarity of query to parentA: " << data.qa << endl;
+ //out << "Similarity of query to parentB: " << data.qb << endl;
- out << "Similarity of parents: " << data.ab << endl;
- out << "Similarity of query to parentA: " << data.qa << endl;
- out << "Similarity of query to parentB: " << data.qb << endl;
- out << "Percent_ID QLA_QRB: " << data.qla_qrb << endl;
- out << "Percent_ID QLB_QRA: " << data.qlb_qra << endl;
//out << "Per_id(QL,A): " << data.qla << endl;
//out << "Per_id(QL,B): " << data.qlb << endl;
//out << "Per_id(QR,A): " << data.qra << endl;
//out << "Per_id(QR,B): " << data.qrb << endl;
- out << "DeltaL: " << (data.qla - data.qlb) << endl;
- out << "DeltaR: " << (data.qra - data.qrb) << endl;
+ //out << "DeltaL: " << (data.qla - data.qlb) << endl;
+ //out << "DeltaR: " << (data.qra - data.qrb) << endl;
}
catch(exception& e) {
class ChimeraSlayer : public Chimera {
public:
- ChimeraSlayer(string, bool);
+ ChimeraSlayer(string, bool, string);
~ChimeraSlayer();
int getChimeras(Sequence*);
void print(ostream&);
void printHeader(ostream&);
+ void doPrep();
private:
Sequence* querySeq;
Maligner* maligner;
Slayer* slayer;
map<int, int> spotMap;
+ Database* databaseRight;
+ Database* databaseLeft;
vector<data_struct> chimeraResults;
- string chimeraFlags, searchMethod;
- string fastafile;
+ string chimeraFlags, searchMethod, fastafile;
bool realign;
void printBlock(data_struct, ostream&);
+
};
/************************************************************************/
for (int i=nColCells-1;i>=0;i--) {
if (found[i] == 0) {
removeCell(colCells[i], -1, i);
+cout << "smallRow = " << smallRow+1 << " smallCol = " << smallCol+1 << endl;
+ if (!((colCells[i]->row == smallRow) && (colCells[i]->column == smallCol))) {
+ mothurOut("Warning: merging " + toString(colCells[i]->row+1) + " " + toString(colCells[i]->column+1) + " distance " + toString(colCells[i]->dist) + " value above cutoff. Results will differ from those if cutoff was used in the read.dist command."); mothurOutEndLine();
+ }
}
}
}
#include "chimera.h"
#include "dist.h"
#include "eachgapdist.h"
+#include "ignoregaps.h"
//***************************************************************************************************************
}
//***************************************************************************************************************
//gets closest matches to each end, since chimeras will most likely have different parents on each end
-vector<Sequence*> DeCalculator::findClosest(Sequence* querySeq, vector<Sequence*> db, int numWanted) {
+vector<Sequence*> DeCalculator::findClosest(Sequence* querySeq, vector<Sequence*> db, int& numWanted, vector<int>& indexes) {
try {
+ indexes.clear();
vector<Sequence*> seqsMatches;
- vector<SeqDist> dists;
+ vector<SeqDist> distsLeft;
+ vector<SeqDist> distsRight;
Dist* distcalculator = new eachGapDist();
string queryAligned = querySeq->getAligned();
//left side
+ bool foundFirstBase = false;
int baseCount = 0;
int leftSpot = 0;
+ int firstBaseSpot = 0;
for (int i = 0; i < queryAligned.length(); i++) {
- leftQuery += queryAligned[i];
//if you are a base
- if ((queryAligned[i] != '.') && (queryAligned[i] != '-')) { baseCount++; }
+ if (isalpha(queryAligned[i])) {
+ baseCount++;
+ if (!foundFirstBase) { foundFirstBase = true; firstBaseSpot = i; }
+ }
+
+ //eliminate opening .'s
+ if (foundFirstBase) { leftQuery += queryAligned[i]; }
//if you have 1/3
if (baseCount >= numBases) { leftSpot = i; break; } //first 1/3
}
int rightSpot = 0;
for (int i = leftSpot; i < queryAligned.length(); i++) {
//if you are a base
- if ((queryAligned[i] != '.') && (queryAligned[i] != '-')) { baseCount++; }
+ if (isalpha(queryAligned[i])) { baseCount++; }
//if you have 1/3
if (baseCount >= numBases) { rightSpot = i; break; } //last 1/3
}
- rightQuery = queryAligned.substr(rightSpot); //sequence from pos spot to end
+
+ //trim end
+ //find last position in query that is a non gap character
+ int lastBaseSpot = queryAligned.length()-1;
+ for (int j = queryAligned.length()-1; j >= 0; j--) {
+ if (isalpha(queryAligned[j])) {
+ lastBaseSpot = j;
+ break;
+ }
+ }
+ rightQuery = queryAligned.substr(rightSpot, (lastBaseSpot-rightSpot)); //sequence from pos spot to end
Sequence queryLeft(querySeq->getName(), leftQuery);
Sequence queryRight(querySeq->getName(), rightQuery);
-//cout << querySeq->getName() << '\t' << leftSpot << '\t' << rightSpot << endl;
+//cout << querySeq->getName() << '\t' << leftSpot << '\t' << rightSpot << '\t' << firstBaseSpot << '\t' << lastBaseSpot << endl;
//cout << queryUnAligned.length() << '\t' << queryLeft.getUnaligned().length() << '\t' << queryRight.getUnaligned().length() << endl;
for(int j = 0; j < db.size(); j++){
string dbAligned = db[j]->getAligned();
- string leftDB = dbAligned.substr(0, leftSpot+1); //first 1/3 of the sequence
- string rightDB = dbAligned.substr(rightSpot); //last 1/3 of the sequence
+ string leftDB = dbAligned.substr(firstBaseSpot, (leftSpot-firstBaseSpot+1)); //first 1/3 of the sequence
+ string rightDB = dbAligned.substr(rightSpot, (lastBaseSpot-rightSpot)); //last 1/3 of the sequence
Sequence dbLeft(db[j]->getName(), leftDB);
Sequence dbRight(db[j]->getName(), rightDB);
distcalculator->calcDist(queryRight, dbRight);
float distRight = distcalculator->getDist();
- float dist = min(distLeft, distRight); //keep smallest distance
+ SeqDist subjectLeft;
+ subjectLeft.seq = db[j];
+ subjectLeft.dist = distLeft;
+ subjectLeft.index = j;
+
+ distsLeft.push_back(subjectLeft);
- SeqDist subject;
- subject.seq = db[j];
- subject.dist = dist;
+ SeqDist subjectRight;
+ subjectRight.seq = db[j];
+ subjectRight.dist = distRight;
+ subjectRight.index = j;
- dists.push_back(subject);
+ distsRight.push_back(subjectRight);
+
}
delete distcalculator;
- sort(dists.begin(), dists.end(), compareSeqDist);
+ //sort by smallest distance
+ sort(distsRight.begin(), distsRight.end(), compareSeqDist);
+ sort(distsLeft.begin(), distsLeft.end(), compareSeqDist);
+
+ //merge results
+ map<string, string> seen;
+ map<string, string>::iterator it;
+ vector<SeqDist> dists;
+ float lastRight = distsRight[0].dist;
+ float lastLeft = distsLeft[0].dist;
+ int lasti = 0;
+ for (int i = 0; i < distsLeft.size(); i++) {
+ //add left if you havent already
+ it = seen.find(distsLeft[i].seq->getName());
+ if (it == seen.end()) {
+ dists.push_back(distsLeft[i]);
+ seen[distsLeft[i].seq->getName()] = distsLeft[i].seq->getName();
+ lastLeft = distsLeft[i].dist;
+ }
+
+ //add right if you havent already
+ it = seen.find(distsRight[i].seq->getName());
+ if (it == seen.end()) {
+ dists.push_back(distsRight[i]);
+ seen[distsRight[i].seq->getName()] = distsRight[i].seq->getName();
+ lastRight = distsRight[i].dist;
+ }
+
+ if (dists.size() > numWanted) { lasti = i; break; } //you have enough results
+ }
+
+ //add in dups
+ lasti++;
+ int i = lasti;
+ while (i < distsLeft.size()) {
+ if (distsLeft[i].dist == lastLeft) { dists.push_back(distsLeft[i]); numWanted++; }
+ else { break; }
+ i++;
+ }
+
+ i = lasti;
+ while (i < distsRight.size()) {
+ if (distsRight[i].dist == lastRight) { dists.push_back(distsRight[i]); numWanted++; }
+ else { break; }
+ i++;
+ }
+
+//cout << numWanted << endl;
for (int i = 0; i < numWanted; i++) {
+//cout << dists[i].seq->getName() << '\t' << dists[i].dist << endl;
Sequence* temp = new Sequence(dists[i].seq->getName(), dists[i].seq->getAligned()); //have to make a copy so you can trim and filter without stepping on eachother.
seqsMatches.push_back(temp);
+ indexes.push_back(dists[i].index);
}
return seqsMatches;
exit(1);
}
}
+//***************************************************************************************************************
+Sequence* DeCalculator::findClosest(Sequence* querySeq, vector<Sequence*> db) {
+ try {
+
+ Sequence* seqsMatch;
+
+ Dist* distcalculator = new eachGapDist();
+ int index = 0;
+ int smallest = 1000000;
+
+ for(int j = 0; j < db.size(); j++){
+
+ distcalculator->calcDist(*querySeq, *db[j]);
+ float dist = distcalculator->getDist();
+
+ if (dist < smallest) {
+ smallest = dist;
+ index = j;
+ }
+ }
+
+ delete distcalculator;
+
+ seqsMatch = new Sequence(db[index]->getName(), db[index]->getAligned()); //have to make a copy so you can trim and filter without stepping on eachother.
+
+ return seqsMatch;
+ }
+ catch(exception& e) {
+ errorOut(e, "DeCalculator", "findClosest");
+ exit(1);
+ }
+}
/***************************************************************************************************************/
map<int, int> DeCalculator::trimSeqs(Sequence* query, vector<Sequence*> topMatches) {
try {
//check to make sure that is not whole seq
if ((rearPos - frontPos - 1) <= 0) { mothurOut("Error, when I trim your sequences, the entire sequence is trimmed."); mothurOutEndLine(); exit(1); }
-//cout << "front = " << frontPos << " rear = " << rearPos << endl;
+//cout << query->getName() << " front = " << frontPos << " rear = " << rearPos << endl;
//trim query
string newAligned = query->getAligned();
- newAligned = newAligned.substr(frontPos, (rearPos-frontPos));
+ newAligned = newAligned.substr(frontPos, (rearPos-frontPos+1));
query->setAligned(newAligned);
//trim topMatches
DeCalculator() {};
~DeCalculator() {};
- vector<Sequence*> findClosest(Sequence*, vector<Sequence*>, int); //takes querySeq, a reference db and numWanted - returns indexes to closest seqs in db
+ vector<Sequence*> findClosest(Sequence*, vector<Sequence*>, int&, vector<int>&); //takes querySeq, a reference db, numWanted and indexes
+ Sequence* findClosest(Sequence*, vector<Sequence*>);
set<int> getPos() { return h; }
void setMask(string);
void setAlignmentLength(int l) { alignLength = l; }
*/
#include "maligner.h"
-#include "database.hpp"
+#include "kmerdb.hpp"
#include "blastdb.hpp"
/***********************************************************************/
-Maligner::Maligner(vector<Sequence*> temp, int num, int match, int misMatch, float div, int ms, int minCov, string mode) :
- db(temp), numWanted(num), matchScore(match), misMatchPenalty(misMatch), minDivR(div), minSimilarity(ms), minCoverage(minCov), searchMethod(mode) {}
+Maligner::Maligner(vector<Sequence*> temp, int num, int match, int misMatch, float div, int ms, int minCov, string mode, Database* dataLeft, Database* dataRight) :
+ db(temp), numWanted(num), matchScore(match), misMatchPenalty(misMatch), minDivR(div), minSimilarity(ms), minCoverage(minCov), searchMethod(mode), databaseLeft(dataLeft), databaseRight(dataRight) {}
/***********************************************************************/
string Maligner::getResults(Sequence* q, DeCalculator* decalc) {
try {
string chimera;
- if (searchMethod != "blast") {
+ if (searchMethod == "distance") {
//find closest seqs to query in template - returns copies of seqs so trim does not destroy - remember to deallocate
- refSeqs = decalc->findClosest(query, db, numWanted);
- }else{
- refSeqs = getBlastSeqs(query, numWanted);
- }
+ refSeqs = decalc->findClosest(query, db, numWanted, indexes);
+ }else if (searchMethod == "blast") {
+ refSeqs = getBlastSeqs(query, numWanted); //fills indexes
+ }else if (searchMethod == "kmer") {
+ refSeqs = getKmerSeqs(query, numWanted); //fills indexes
+ }else { mothurOut("not valid search."); exit(1); } //should never get here
refSeqs = minCoverageFilter(refSeqs);
-
+
if (refSeqs.size() < 2) {
for (int i = 0; i < refSeqs.size(); i++) { delete refSeqs[i]; }
percentIdenticalQueryChimera = 0.0;
}
int chimeraPenalty = computeChimeraPenalty();
-
+
//fills outputResults
chimera = chimeraMaligner(chimeraPenalty, decalc);
-
+
//free memory
delete query;
+
for (int i = 0; i < refSeqs.size(); i++) { delete refSeqs[i]; }
return chimera;
//trims seqs to first non gap char in all seqs and last non gap char in all seqs
spotMap = decalc->trimSeqs(query, refSeqs);
-
+
vector<Sequence*> temp = refSeqs;
temp.push_back(query);
verticalFilter(temp);
+//for (int i = 0; i < temp.size(); i++) { cout << temp[i]->getName() << '\n' << temp[i]->getAligned() << endl; } return "no";
vector< vector<score_struct> > matrix = buildScoreMatrix(query->getAligned().length(), refSeqs.size()); //builds and initializes
fillScoreMatrix(matrix, refSeqs, chimeraPenalty);
-
+
vector<score_struct> path = extractHighestPath(matrix);
vector<trace_struct> trace = mapTraceRegionsToAlignment(path, refSeqs);
-
+
if (trace.size() > 1) { chimera = "yes"; }
else { chimera = "no"; }
int traceStart = path[0].col;
- int traceEnd = path[path.size()-1].col;
-
+ int traceEnd = path[path.size()-1].col;
string queryInRange = query->getAligned();
queryInRange = queryInRange.substr(traceStart, (traceEnd-traceStart+1));
string chimeraSeq = constructChimericSeq(trace, refSeqs);
-
+
percentIdenticalQueryChimera = computePercentID(queryInRange, chimeraSeq);
//save output results
results temp;
temp.parent = refSeqs[seqIndex]->getName();
+ temp.parentAligned = db[indexes[seqIndex]]->getAligned();
temp.nastRegionStart = spotMap[regionStart];
temp.nastRegionEnd = spotMap[regionEnd];
temp.regionStart = regionStart;
//***************************************************************************************************************
vector<Sequence*> Maligner::getBlastSeqs(Sequence* q, int num) {
try {
+ indexes.clear();
+ vector<Sequence*> refResults;
//generate blastdb
Database* database = new BlastDB(-2.0, -1.0, matchScore, misMatchPenalty);
for (int i = 0; i < db.size(); i++) { database->addSequence(*db[i]); }
vector<int> tempIndexesRight = database->findClosestMegaBlast(queryRight, num+1);
vector<int> tempIndexesLeft = database->findClosestMegaBlast(queryLeft, num+1);
+ //if ((tempIndexesRight.size() != (num+1)) || (tempIndexesLeft.size() != (num+1))) { mothurOut("megablast returned " + toString(tempIndexesRight.size()) + " results for the right end, and " + toString(tempIndexesLeft.size()) + " for the left end. Needed " + toString(num+1) + ". Unable to porcess sequence " + q->getName()); mothurOutEndLine(); return refResults; }
+
+ vector<int> smaller;
+ vector<int> larger;
+
+ if (tempIndexesRight.size() < tempIndexesLeft.size()) { smaller = tempIndexesRight; larger = tempIndexesLeft; }
+ else { smaller = tempIndexesLeft; larger = tempIndexesRight; }
+
+ //merge results
+ map<int, int> seen;
+ map<int, int>::iterator it;
+
+ vector<int> mergedResults;
+ for (int i = 0; i < smaller.size(); i++) {
+ //add left if you havent already
+ it = seen.find(smaller[i]);
+ if (it == seen.end()) {
+ mergedResults.push_back(smaller[i]);
+ seen[smaller[i]] = smaller[i];
+ }
+
+ //add right if you havent already
+ it = seen.find(larger[i]);
+ if (it == seen.end()) {
+ mergedResults.push_back(larger[i]);
+ seen[larger[i]] = larger[i];
+ }
+ }
+
+ for (int i = smaller.size(); i < larger.size(); i++) {
+ it = seen.find(larger[i]);
+ if (it == seen.end()) {
+ mergedResults.push_back(larger[i]);
+ seen[larger[i]] = larger[i];
+ }
+ }
+
+ if (mergedResults.size() < numWanted) { numWanted = mergedResults.size(); }
+//cout << q->getName() << endl;
+ for (int i = 0; i < numWanted; i++) {
+//cout << db[mergedResults[i]]->getName() << endl;
+ if (db[mergedResults[i]]->getName() != q->getName()) {
+ Sequence* temp = new Sequence(db[mergedResults[i]]->getName(), db[mergedResults[i]]->getAligned());
+ refResults.push_back(temp);
+ indexes.push_back(mergedResults[i]);
+ }
+//cout << mergedResults[i] << endl;
+ }
+//cout << endl;
+ delete queryRight;
+ delete queryLeft;
+ delete database;
+
+ return refResults;
+ }
+ catch(exception& e) {
+ errorOut(e, "Maligner", "getBlastSeqs");
+ exit(1);
+ }
+}
+//***************************************************************************************************************
+vector<Sequence*> Maligner::getKmerSeqs(Sequence* q, int num) {
+ try {
+ indexes.clear();
+
+ //get parts of query
+ string queryUnAligned = q->getUnaligned();
+ string leftQuery = queryUnAligned.substr(0, int(queryUnAligned.length() * 0.33)); //first 1/3 of the sequence
+ string rightQuery = queryUnAligned.substr(int(queryUnAligned.length() * 0.66)); //last 1/3 of the sequence
+
+ Sequence* queryLeft = new Sequence(q->getName(), leftQuery);
+ Sequence* queryRight = new Sequence(q->getName(), rightQuery);
+
+ vector<int> tempIndexesLeft = databaseLeft->findClosestSequences(queryLeft, numWanted);
+ vector<int> tempIndexesRight = databaseRight->findClosestSequences(queryRight, numWanted);
+
//merge results
map<int, int> seen;
map<int, int>::iterator it;
}
}
-
+//cout << q->getName() << endl;
vector<Sequence*> refResults;
for (int i = 0; i < numWanted; i++) {
+//cout << db[mergedResults[i]]->getName() << endl;
Sequence* temp = new Sequence(db[mergedResults[i]]->getName(), db[mergedResults[i]]->getAligned());
refResults.push_back(temp);
+ indexes.push_back(mergedResults[i]);
}
-
+//cout << endl;
delete queryRight;
delete queryLeft;
- delete database;
return refResults;
}
exit(1);
}
}
-
//***************************************************************************************************************
#include "decalc.h"
#include "chimera.h"
+#include "database.hpp"
/***********************************************************************/
//This class was modeled after the chimeraMaligner written by the Broad Institute
public:
- Maligner(vector<Sequence*>, int, int, int, float, int, int, string);
+ Maligner(vector<Sequence*>, int, int, int, float, int, int, string, Database*, Database*);
~Maligner() {};
string getResults(Sequence*, DeCalculator*);
string searchMethod;
float minDivR, percentIdenticalQueryChimera;
vector<results> outputResults;
+ vector<int> indexes; //stores index into template seqs of the refSeqs, so we can return the whole sequence rather than the trimmed and filtered one
map<int, int> spotMap;
+ Database* databaseLeft;
+ Database* databaseRight;
vector<Sequence*> minCoverageFilter(vector<Sequence*>); //removes top matches that do not have minimum coverage with query.
int computeChimeraPenalty();
float computePercentID(string, string);
string chimeraMaligner(int, DeCalculator*);
vector<Sequence*> getBlastSeqs(Sequence*, int);
+ vector<Sequence*> getKmerSeqs(Sequence*, int);
};
input = argv[1];
if (input[0] == '#') {
+ mothurOutJustToLog("Script Mode");
+ mothurOutEndLine(); mothurOutEndLine();
+
mothur = new ScriptEngine(argv[0], argv[1]);
}else{
+ mothurOutJustToLog("Batch Mode");
+ mothurOutEndLine(); mothurOutEndLine();
+
mothur = new BatchEngine(argv[0], argv[1]);
}
}
else{
+ mothurOutJustToLog("Interactive Mode");
+ mothurOutEndLine(); mothurOutEndLine();
+
mothur = new InteractEngine(argv[0]);
}
//***************************************************************************************************************
void Pintail::doPrep() {
try {
-
+
+ mergedFilterString = "";
windowSizesTemplate.resize(templateSeqs.size(), window);
quantiles.resize(100); //one for every percent mismatch
quantilesMembers.resize(100); //one for every percent mismatch
mothurOut("Done."); mothurOutEndLine();
}else { probabilityProfile = readFreq(); mothurOut("Done."); }
mothurOutEndLine();
-
- //quantiles are used to determine whether the de values found indicate a chimera
- //if you have to calculate them, its time intensive because you are finding the de and deviation values for each
- //combination of sequences in the template
- if (quanfile != "") { quantiles = readQuantiles(); }
- else {
+
+ //make P into Q
+ for (int i = 0; i < probabilityProfile.size(); i++) { probabilityProfile[i] = 1 - probabilityProfile[i]; } //cout << i << '\t' << probabilityProfile[i] << endl;
+
+ bool reRead = false;
+ //create filter if needed for later
+ if (filter) {
+ reRead = true;
- //if user has not provided the quantiles and wants the mask we need to mask, but then delete and reread or we will mess up the best match process in getChimeras
if (seqMask != "") {
//mask templates
for (int i = 0; i < templateSeqs.size(); i++) {
}
}
- if (filter) {
- createFilter(templateSeqs);
- for (int i = 0; i < templateSeqs.size(); i++) { runFilter(templateSeqs[i]); }
+ //read in all query seqs
+ ifstream in;
+ openInputFile(fastafile, in);
+
+ vector<Sequence*> tempQuerySeqs;
+ while(!in.eof()){
+ Sequence* s = new Sequence(in);
+ gobble(in);
+
+ if (s->getName() != "") { tempQuerySeqs.push_back(s); }
+ }
+ in.close();
+
+ vector<Sequence*> temp;
+ //merge query seqs and template seqs
+ temp = templateSeqs;
+ for (int i = 0; i < tempQuerySeqs.size(); i++) { temp.push_back(tempQuerySeqs[i]); }
+
+ mergedFilterString = createFilter(temp, 0.5);
+
+ //reread template seqs
+ for (int i = 0; i < tempQuerySeqs.size(); i++) { delete tempQuerySeqs[i]; }
+ }
+
+
+ //quantiles are used to determine whether the de values found indicate a chimera
+ //if you have to calculate them, its time intensive because you are finding the de and deviation values for each
+ //combination of sequences in the template
+ if (quanfile != "") {
+ quantiles = readQuantiles();
+ }else {
+ if ((!filter) && (seqMask != "")) { //if you didn't filter but you want to mask. if you filtered then you did mask first above.
+ reRead = true;
+ //mask templates
+ for (int i = 0; i < templateSeqs.size(); i++) {
+ decalc->runMask(templateSeqs[i]);
+ }
}
-
mothurOut("Calculating quantiles for your template. This can take a while... I will output the quantiles to a .quan file that you can input them using the quantiles parameter next time you run this command. Providing the .quan file will dramatically improve speed. "); cout.flush();
if (processors == 1) {
}
mothurOut("Done."); mothurOutEndLine();
-
- //if mask reread template
- if ((seqMask != "") || (filter)) {
- for (int i = 0; i < templateSeqs.size(); i++) { delete templateSeqs[i]; }
- templateSeqs = readSeqs(templateFileName);
- }
-
}
+ if (reRead) {
+ for (int i = 0; i < templateSeqs.size(); i++) { delete templateSeqs[i]; }
+ templateSeqs.clear();
+ templateSeqs = readSeqs(templateFileName);
+ }
+
+
//free memory
for (int i = 0; i < templateLines.size(); i++) { delete templateLines[i]; }
querySeq = query;
trimmed.clear();
windowSizes = window;
-
- //find pairs
- bestfit = findPairs(query);
- //make P into Q
- for (int i = 0; i < probabilityProfile.size(); i++) { probabilityProfile[i] = 1 - probabilityProfile[i]; } //cout << i << '\t' << probabilityProfile[i] << endl;
-
- //mask sequences if the user wants to
+ //find pairs has to be done before a mask
+ bestfit = findPairs(query);
+
+ //if they mask
if (seqMask != "") {
decalc->runMask(query);
decalc->runMask(bestfit);
}
-
- if (filter) {
+
+ if (filter) { //must be done after a mask
runFilter(query);
runFilter(bestfit);
}
+
//trim seq
decalc->trimSeqs(query, bestfit, trimmed);
Sequence* seqsMatches;
- vector<Sequence*> copy = decalc->findClosest(q, templateSeqs, 1);
- seqsMatches = copy[0];
-
+ seqsMatches = decalc->findClosest(q, templateSeqs);
return seqsMatches;
}
vector< vector<float> > quantiles; //quantiles[0] is the vector of deviations with ceiling score of 1, quantiles[1] is the vector of deviations with ceiling score of 2...
vector< vector<quanMember> > quantilesMembers; //quantiles[0] is the vector of deviations with ceiling score of 1, quantiles[1] is the vector of deviations with ceiling score of 2...
set<int> h;
+ string mergedFilterString;
vector<float> readFreq();
else {
//valid paramters for this command
- string Array[] = {"fasta","name", "group", "alignreport", "accnos", "list","outputdir","inputdir" };
+ string Array[] = {"fasta","name", "group", "alignreport", "accnos", "list","outputdir","inputdir", "dups" };
vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
OptionParser parser(option);
if (listfile == "not open") { abort = true; }
else if (listfile == "not found") { listfile = ""; }
+ string usedDups = "true";
+ string temp = validParameter.validFile(parameters, "dups", false); if (temp == "not found") { temp = "false"; usedDups = ""; }
+ dups = isTrue(temp);
+
if ((fastafile == "") && (namefile == "") && (groupfile == "") && (alignfile == "") && (listfile == "")) { mothurOut("You must provide one of the following: fasta, name, group, alignreport or list."); mothurOutEndLine(); abort = true; }
int okay = 2;
if (outputDir != "") { okay++; }
+ if (usedDups != "") { okay++; }
+
+ if ((usedDups != "") && (namefile == "")) { mothurOut("You may only use dups with the name option."); mothurOutEndLine(); abort = true; }
if (parameters.size() > okay) { mothurOut("You may only enter one of the following: fasta, name, group, alignreport, or list."); mothurOutEndLine(); abort = true; }
}
try {
mothurOut("The remove.seqs command reads an .accnos file and one of the following file types: fasta, name, group, list or alignreport file.\n");
mothurOut("It outputs a file containing the sequences NOT in the .accnos file.\n");
- mothurOut("The remove.seqs command parameters are accnos, fasta, name, group, list and alignreport. You must provide accnos and one of the other parameters.\n");
+ mothurOut("The remove.seqs command parameters are accnos, fasta, name, group, list, alignreport and dups. You must provide accnos and one of the file parameters.\n");
+ mothurOut("The dups parameter allows you to remove the entire line from a name file if you remove any name from the line. default=false. If dups=true, then remove.seqs outputs a new .accnos file containing all the sequences removed. \n");
mothurOut("The remove.seqs command should be in the following format: remove.seqs(accnos=yourAccnos, fasta=yourFasta).\n");
mothurOut("Example remove.seqs(accnos=amazon.accnos, fasta=amazon.fasta).\n");
mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n\n");
try {
if (outputDir == "") { outputDir += hasPath(namefile); }
string outputFileName = getRootName(namefile) + "pick" + getExtension(namefile);
+ string outputFileName2 = getRootName(namefile) + "dups.accnos";
+ ofstream out2;
+ if (dups) { openOutputFile(outputFileName2, out2); }
+ bool wroteDups = false;
+
ofstream out;
openOutputFile(outputFileName, out);
}
}
- //if the name in the first column is in the set then print it and any other names in second column also in set
- if (names.count(firstCol) == 0) {
-
- wroteSomething = true;
-
- out << firstCol << '\t';
-
- //you know you have at least one valid second since first column is valid
- for (int i = 0; i < validSecond.size()-1; i++) { out << validSecond[i] << ','; }
- out << validSecond[validSecond.size()-1] << endl;
-
- //make first name in set you come to first column and then add the remaining names to second column
+ if ((dups) && (validSecond.size() != parsedNames.size())) {
+ wroteDups = true;
+ for (int i = 0; i < parsedNames.size(); i++) { out2 << parsedNames[i] << endl; }
}else {
-
- //you want part of this row
- if (validSecond.size() != 0) {
-
+ //if the name in the first column is in the set then print it and any other names in second column also in set
+ if (names.count(firstCol) == 0) {
+
wroteSomething = true;
- out << validSecond[0] << '\t';
-
+ out << firstCol << '\t';
+
//you know you have at least one valid second since first column is valid
for (int i = 0; i < validSecond.size()-1; i++) { out << validSecond[i] << ','; }
out << validSecond[validSecond.size()-1] << endl;
+
+ //make first name in set you come to first column and then add the remaining names to second column
+ }else {
+
+ //you want part of this row
+ if (validSecond.size() != 0) {
+
+ wroteSomething = true;
+
+ out << validSecond[0] << '\t';
+
+ //you know you have at least one valid second since first column is valid
+ for (int i = 0; i < validSecond.size()-1; i++) { out << validSecond[i] << ','; }
+ out << validSecond[validSecond.size()-1] << endl;
+ }
}
}
-
gobble(in);
}
in.close();
out.close();
+ if (dups) { out2.close(); }
+ if (wroteDups == false) {
+ remove(outputFileName2.c_str());
+ }
+
if (wroteSomething == false) {
mothurOut("Your file contains only sequences from the .accnos file."); mothurOutEndLine();
remove(outputFileName.c_str());
private:
set<string> names;
string accnosfile, fastafile, namefile, groupfile, alignfile, listfile, outputDir;
- bool abort;
+ bool abort, dups;
void readFasta();
void readName();
for(int i=1;i<=hold;i++){
f >> inputData;
+
set(i, inputData);
}
+
}
catch(exception& e) {
errorOut(e, "SAbundVector", "SAbundVector");
try {
RAbundVector rav;
- for(int i=1;i<=data.size();i++){
+ for(int i=1;i < data.size();i++){
for(int j=0;j<data[i];j++){
rav.push_back(i);
}
}
sort(rav.rbegin(), rav.rend());
-
+
rav.setLabel(label);
return rav;
}
try {
RAbundVector rav;
- for(int i=1;i<=data.size();i++){
+ for(int i=1;i<data.size();i++){
for(int j=0;j<data[i].abundance;j++){
rav.push_back(i);
}
try {
SharedRAbundVector rav;
- for(int i=1;i<=data.size();i++){
+ for(int i=1;i<data.size();i++){
for(int j=0;j<data[i].abundance;j++){
rav.push_back(i, data[i].group);
}