From: SarahsWork Date: Mon, 18 Mar 2013 17:36:59 +0000 (-0400) Subject: working on dereplicate=t issue in chimera.slayer and chimera.perseus, added appendFil... X-Git-Url: https://git.donarmstrong.com/?p=mothur.git;a=commitdiff_plain;h=859e3a473a3e63e0060c49be70b80f9289253da2 working on dereplicate=t issue in chimera.slayer and chimera.perseus, added appendFilesWithoutHeaders to mothurOut. fixed bug in make.contigs with headers. fixed name file overwrite in unique.seqs. fixed seq name mismatch in get.seqs if dups=f. added count file to seq. error command. --- diff --git a/chimera.h b/chimera.h index bb4c29c..e187bfc 100644 --- a/chimera.h +++ b/chimera.h @@ -165,7 +165,7 @@ class Chimera { #ifdef USE_MPI virtual Sequence print(MPI_File&, MPI_File&){ Sequence temp; return temp; } - virtual Sequence print(MPI_File&, MPI_File&, data_results, data_results){ Sequence temp; return temp; } + virtual Sequence print(MPI_File&, MPI_File&, data_results, data_results, bool&){ Sequence temp; return temp; } virtual int print(MPI_File&, MPI_File&, string){ return 0; } #endif diff --git a/chimeraperseuscommand.cpp b/chimeraperseuscommand.cpp index 66889f1..1835862 100644 --- a/chimeraperseuscommand.cpp +++ b/chimeraperseuscommand.cpp @@ -45,7 +45,7 @@ string ChimeraPerseusCommand::getHelpString(){ helpString += "The chimera.perseus command parameters are fasta, name, group, cutoff, processors, dereplicate, alpha and beta.\n"; helpString += "The fasta parameter allows you to enter the fasta file containing your potentially chimeric sequences, and is required, unless you have a valid current fasta file. \n"; helpString += "The name parameter allows you to provide a name file associated with your fasta file.\n"; - helpString += "The count parameter allows you to provide a count file associated with your fasta file. A count or name file is required. \n"; + helpString += "The count parameter allows you to provide a count file associated with your fasta file. A count or name file is required. When you use a count file with group info and dereplicate=T, mothur will create a *.pick.count_table file containing seqeunces after chimeras are removed.\n"; helpString += "You may enter multiple fasta files by separating their names with dashes. ie. fasta=abrecovery.fasta-amazon.fasta \n"; helpString += "The group parameter allows you to provide a group file. When checking sequences, only sequences from the same group as the query sequence will be used as the reference. \n"; helpString += "The processors parameter allows you to specify how many processors you would like to use. The default is 1. \n"; @@ -70,7 +70,8 @@ string ChimeraPerseusCommand::getOutputPattern(string type) { string pattern = ""; if (type == "chimera") { pattern = "[filename],perseus.chimeras"; } - else if (type == "accnos") { pattern = "[filename],perseus.accnos"; } + else if (type == "accnos") { pattern = "[filename],perseus.accnos"; } + else if (type == "count") { pattern = "[filename],perseus.pick.count_table"; } else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true; } return pattern; @@ -88,6 +89,7 @@ ChimeraPerseusCommand::ChimeraPerseusCommand(){ vector tempOutNames; outputTypes["chimera"] = tempOutNames; outputTypes["accnos"] = tempOutNames; + outputTypes["count"] = tempOutNames; } catch(exception& e) { m->errorOut(e, "ChimeraPerseusCommand", "ChimeraPerseusCommand"); @@ -122,6 +124,7 @@ ChimeraPerseusCommand::ChimeraPerseusCommand(string option) { vector tempOutNames; outputTypes["chimera"] = tempOutNames; outputTypes["accnos"] = tempOutNames; + outputTypes["count"] = tempOutNames; //if the user changes the input directory command factory will send this info to us in the output parameter string inputDir = validParameter.validFile(parameters, "inputdir", false); @@ -496,6 +499,7 @@ int ChimeraPerseusCommand::execute(){ variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])); string outputFileName = getOutputFileName("chimera", variables); string accnosFileName = getOutputFileName("accnos", variables); + string newCountFile = ""; //string newFasta = m->getRootName(fastaFileNames[s]) + "temp"; @@ -519,6 +523,8 @@ int ChimeraPerseusCommand::execute(){ if (ct->hasGroupInfo()) { cparser = new SequenceCountParser(fastaFileNames[s], *ct); + variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(nameFile)); + newCountFile = getOutputFileName("count", variables); vector groups = cparser->getNamesOfGroups(); @@ -529,13 +535,51 @@ int ChimeraPerseusCommand::execute(){ m->openOutputFile(outputFileName, out); out.close(); m->openOutputFile(accnosFileName, out1); out1.close(); - if(processors == 1) { numSeqs = driverGroups(outputFileName, accnosFileName, 0, groups.size(), groups); } - else { numSeqs = createProcessesGroups(outputFileName, accnosFileName, groups, groupFile, fastaFileNames[s], nameFile); } + if(processors == 1) { numSeqs = driverGroups(outputFileName, accnosFileName, newCountFile, 0, groups.size(), groups); + if (dups) { + CountTable c; c.readTable(nameFile); + if (!m->isBlank(newCountFile)) { + ifstream in2; + m->openInputFile(newCountFile, in2); + + string name, group; + while (!in2.eof()) { + in2 >> name >> group; m->gobble(in2); + c.setAbund(name, group, 0); + } + in2.close(); + } + m->mothurRemove(newCountFile); + c.printTable(newCountFile); + } + + } + else { numSeqs = createProcessesGroups(outputFileName, accnosFileName, newCountFile, groups, groupFile, fastaFileNames[s], nameFile); } if (m->control_pressed) { delete ct; delete cparser; for (int j = 0; j < outputNames.size(); j++) { m->mothurRemove(outputNames[j]); } return 0; } map uniqueNames = cparser->getAllSeqsMap(); if (!dups) { numChimeras = deconvoluteResults(uniqueNames, outputFileName, accnosFileName); + }else { + set doNotRemove; + CountTable c; c.readTable(newCountFile); + vector namesInTable = c.getNamesOfSeqs(); + for (int i = 0; i < namesInTable.size(); i++) { + int temp = c.getNumSeqs(namesInTable[i]); + if (temp == 0) { c.remove(namesInTable[i]); } + else { doNotRemove.insert((namesInTable[i])); } + } + //remove names we want to keep from accnos file. + set accnosNames = m->readAccnos(accnosFileName); + ofstream out2; + m->openOutputFile(accnosFileName, out2); + for (set::iterator it = accnosNames.begin(); it != accnosNames.end(); it++) { + if (doNotRemove.count(*it) == 0) { out2 << (*it) << endl; } + } + out2.close(); + c.printTable(newCountFile); + outputNames.push_back(newCountFile); outputTypes["count"].push_back(newCountFile); + } delete cparser; @@ -567,8 +611,8 @@ int ChimeraPerseusCommand::execute(){ m->openOutputFile(outputFileName, out); out.close(); m->openOutputFile(accnosFileName, out1); out1.close(); - if(processors == 1) { numSeqs = driverGroups(outputFileName, accnosFileName, 0, groups.size(), groups); } - else { numSeqs = createProcessesGroups(outputFileName, accnosFileName, groups, groupFile, fastaFileNames[s], nameFile); } + if(processors == 1) { numSeqs = driverGroups(outputFileName, accnosFileName, "", 0, groups.size(), groups); } + else { numSeqs = createProcessesGroups(outputFileName, accnosFileName, "", groups, groupFile, fastaFileNames[s], nameFile); } if (m->control_pressed) { delete parser; for (int j = 0; j < outputNames.size(); j++) { m->mothurRemove(outputNames[j]); } return 0; } map uniqueNames = parser->getAllSeqsMap(); @@ -652,11 +696,14 @@ string ChimeraPerseusCommand::getNamesFile(string& inputFile){ } } //********************************************************************************************************************** -int ChimeraPerseusCommand::driverGroups(string outputFName, string accnos, int start, int end, vector groups){ +int ChimeraPerseusCommand::driverGroups(string outputFName, string accnos, string countlist, int start, int end, vector groups){ try { int totalSeqs = 0; int numChimeras = 0; + + ofstream outCountList; + if (hasCount && dups) { m->openOutputFile(countlist, outCountList); } for (int i = start; i < end; i++) { @@ -672,6 +719,39 @@ int ChimeraPerseusCommand::driverGroups(string outputFName, string accnos, int s totalSeqs += numSeqs; if (m->control_pressed) { return 0; } + + if (dups) { + if (!m->isBlank(accnos+groups[i])) { + ifstream in; + m->openInputFile(accnos+groups[i], in); + string name; + if (hasCount) { + while (!in.eof()) { + in >> name; m->gobble(in); + outCountList << name << '\t' << groups[i] << endl; + } + in.close(); + }else { + map thisnamemap = parser->getNameMap(groups[i]); + map::iterator itN; + ofstream out; + m->openOutputFile(accnos+groups[i]+".temp", out); + while (!in.eof()) { + in >> name; m->gobble(in); + itN = thisnamemap.find(name); + if (itN != thisnamemap.end()) { + vector tempNames; m->splitAtComma(itN->second, tempNames); + for (int j = 0; j < tempNames.size(); j++) { out << tempNames[j] << endl; } + + }else { m->mothurOut("[ERROR]: parsing cannot find " + name + ".\n"); m->control_pressed = true; } + } + out.close(); + in.close(); + m->renameFile(accnos+groups[i]+".temp", accnos+groups[i]); + } + + } + } //append files m->appendFiles((outputFName+groups[i]), outputFName); m->mothurRemove((outputFName+groups[i])); @@ -680,6 +760,8 @@ int ChimeraPerseusCommand::driverGroups(string outputFName, string accnos, int s m->mothurOutEndLine(); m->mothurOut("It took " + toString(time(NULL) - start) + " secs to check " + toString(numSeqs) + " sequences from group " + groups[i] + "."); m->mothurOutEndLine(); } + if (hasCount && dups) { outCountList.close(); } + return totalSeqs; } @@ -978,13 +1060,16 @@ int ChimeraPerseusCommand::driver(string chimeraFileName, vector& seque } } /**************************************************************************************************/ -int ChimeraPerseusCommand::createProcessesGroups(string outputFName, string accnos, vector groups, string group, string fasta, string name) { +int ChimeraPerseusCommand::createProcessesGroups(string outputFName, string accnos, string newCountFile, vector groups, string group, string fasta, string name) { try { vector processIDS; int process = 1; int num = 0; + CountTable newCount; + if (hasCount && dups) { newCount.readTable(name); } + //sanity check if (groups.size() < processors) { processors = groups.size(); } @@ -1008,7 +1093,7 @@ int ChimeraPerseusCommand::createProcessesGroups(string outputFName, string accn processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later process++; }else if (pid == 0){ - num = driverGroups(outputFName + toString(getpid()) + ".temp", accnos + toString(getpid()) + ".temp", lines[process].start, lines[process].end, groups); + num = driverGroups(outputFName + toString(getpid()) + ".temp", accnos + toString(getpid()) + ".temp", accnos + ".byCount." + toString(getpid()) + ".temp", lines[process].start, lines[process].end, groups); //pass numSeqs to parent ofstream out; @@ -1026,7 +1111,7 @@ int ChimeraPerseusCommand::createProcessesGroups(string outputFName, string accn } //do my part - num = driverGroups(outputFName, accnos, lines[0].start, lines[0].end, groups); + num = driverGroups(outputFName, accnos, accnos + ".byCount", lines[0].start, lines[0].end, groups); //force parent to wait until all the processes are done for (int i=0;icount != pDataArray[i]->end) { - m->mothurOut("[ERROR]: process " + toString(i) + " only processed " + toString(pDataArray[i]->count) + " of " + toString(pDataArray[i]->end) + " sequences assigned to it, quitting. \n"); m->control_pressed = true; - } num += pDataArray[i]->count; CloseHandle(hThreadArray[i]); delete pDataArray[i]; } #endif - + //read my own + if (hasCount && dups) { + if (!m->isBlank(accnos + ".byCount")) { + ifstream in2; + m->openInputFile(accnos + ".byCount", in2); + + string name, group; + while (!in2.eof()) { + in2 >> name >> group; m->gobble(in2); + newCount.setAbund(name, group, 0); + } + in2.close(); + } + m->mothurRemove(accnos + ".byCount"); + } + //append output files for(int i=0;iappendFiles((accnos + toString(processIDS[i]) + ".temp"), accnos); m->mothurRemove((accnos + toString(processIDS[i]) + ".temp")); + + if (hasCount && dups) { + if (!m->isBlank(accnos + ".byCount." + toString(processIDS[i]) + ".temp")) { + ifstream in2; + m->openInputFile(accnos + ".byCount." + toString(processIDS[i]) + ".temp", in2); + + string name, group; + while (!in2.eof()) { + in2 >> name >> group; m->gobble(in2); + newCount.setAbund(name, group, 0); + } + in2.close(); + } + m->mothurRemove(accnos + ".byCount." + toString(processIDS[i]) + ".temp"); + } + } + //print new *.pick.count_table + if (hasCount && dups) { newCount.printTable(newCountFile); } + return num; } diff --git a/chimeraperseuscommand.h b/chimeraperseuscommand.h index b3d6ccf..664200f 100644 --- a/chimeraperseuscommand.h +++ b/chimeraperseuscommand.h @@ -64,8 +64,8 @@ private: vector readFiles(string inputFile, CountTable* ct); vector loadSequences(string); int deconvoluteResults(map&, string, string); - int driverGroups(string, string, int, int, vector); - int createProcessesGroups(string, string, vector, string, string, string); + int driverGroups(string, string, string, int, int, vector); + int createProcessesGroups(string, string, string, vector, string, string, string); string removeNs(string); }; @@ -79,16 +79,17 @@ struct perseusData { string groupfile; string outputFName; string accnos; + string countlist; MothurOut* m; int start; int end; - bool hasName, hasCount; + bool hasName, hasCount, dups; int threadID, count, numChimeras; double alpha, beta, cutoff; vector groups; perseusData(){} - perseusData(bool hn, bool hc, double a, double b, double c, string o, string f, string n, string g, string ac, vector gr, MothurOut* mout, int st, int en, int tid) { + perseusData(bool dps, bool hn, bool hc, double a, double b, double c, string o, string f, string n, string g, string ac, string ctlist, vector gr, MothurOut* mout, int st, int en, int tid) { alpha = a; beta = b; cutoff = c; @@ -96,6 +97,7 @@ struct perseusData { namefile = n; groupfile = g; outputFName = o; + countlist = ctlist; accnos = ac; m = mout; start = st; @@ -104,6 +106,7 @@ struct perseusData { groups = gr; hasName = hn; hasCount = hc; + dups = dps; count = 0; numChimeras = 0; } @@ -137,6 +140,9 @@ static DWORD WINAPI MyPerseusThreadFunction(LPVOID lpParam){ int totalSeqs = 0; int numChimeras = 0; + + ofstream outCountList; + if (pDataArray->hasCount && pDataArray->dups) { pDataArray->m->openOutputFile(pDataArray->countlist, outCountList); } for (int u = pDataArray->start; u < pDataArray->end; u++) { @@ -347,6 +353,39 @@ static DWORD WINAPI MyPerseusThreadFunction(LPVOID lpParam){ //////////////////////////////////////////////////////////////////////////////////////// totalSeqs += numSeqs; + + if (pDataArray->dups) { + if (!pDataArray->m->isBlank(accnosFileName)) { + ifstream in; + pDataArray->m->openInputFile(accnosFileName, in); + string name; + if (pDataArray->hasCount) { + while (!in.eof()) { + in >> name; pDataArray->m->gobble(in); + outCountList << name << '\t' << groups[u] << endl; + } + in.close(); + }else { + map thisnamemap = parser->getNameMap(groups[u]); + map::iterator itN; + ofstream out; + pDataArray->m->openOutputFile(accnosFileName+".temp", out); + while (!in.eof()) { + in >> name; pDataArray->m->gobble(in); + itN = thisnamemap.find(name); + if (itN != thisnamemap.end()) { + vector tempNames; pDataArray->m->splitAtComma(itN->second, tempNames); + for (int j = 0; j < tempNames.size(); j++) { out << tempNames[j] << endl; } + + }else { pDataArray->m->mothurOut("[ERROR]: parsing cannot find " + name + ".\n"); pDataArray->m->control_pressed = true; } + } + out.close(); + in.close(); + pDataArray->m->renameFile(accnosFileName+".temp", accnosFileName); + } + + } + } //append files pDataArray->m->appendFiles(chimeraFileName, pDataArray->outputFName); pDataArray->m->mothurRemove(chimeraFileName); @@ -356,6 +395,8 @@ static DWORD WINAPI MyPerseusThreadFunction(LPVOID lpParam){ if (pDataArray->m->control_pressed) { if (pDataArray->hasCount) { delete cparser; } { delete parser; } pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); return 0; } } + if (pDataArray->hasCount && pDataArray->dups) { outCountList.close(); } + pDataArray->count = totalSeqs; if (pDataArray->hasCount) { delete cparser; } { delete parser; } return totalSeqs; diff --git a/chimeraslayer.cpp b/chimeraslayer.cpp index 9ceba52..102db74 100644 --- a/chimeraslayer.cpp +++ b/chimeraslayer.cpp @@ -574,12 +574,13 @@ Sequence ChimeraSlayer::print(ostream& out, ostream& outAcc, data_results leftPi #ifdef USE_MPI //*************************************************************************************************************** -Sequence ChimeraSlayer::print(MPI_File& out, MPI_File& outAcc, data_results leftPiece, data_results rightPiece) { +Sequence ChimeraSlayer::print(MPI_File& out, MPI_File& outAcc, data_results leftPiece, data_results rightPiece, bool& chimFlag) { try { MPI_Status status; bool results = false; string outAccString = ""; string outputString = ""; + chimFlag = false; Sequence trim; @@ -628,6 +629,7 @@ Sequence ChimeraSlayer::print(MPI_File& out, MPI_File& outAcc, data_results left memcpy(buf2, outAccString.c_str(), length); MPI_File_write_shared(outAcc, buf2, length, MPI_CHAR, &status); + chimFlag = true; delete buf2; if (trimChimera) { diff --git a/chimeraslayer.h b/chimeraslayer.h index c409c50..7bf663a 100644 --- a/chimeraslayer.h +++ b/chimeraslayer.h @@ -40,7 +40,7 @@ class ChimeraSlayer : public Chimera { #ifdef USE_MPI Sequence print(MPI_File&, MPI_File&); - Sequence print(MPI_File&, MPI_File&, data_results, data_results); + Sequence print(MPI_File&, MPI_File&, data_results, data_results, bool&); #endif private: diff --git a/chimeraslayercommand.cpp b/chimeraslayercommand.cpp index a1be235..d8e0952 100644 --- a/chimeraslayercommand.cpp +++ b/chimeraslayercommand.cpp @@ -65,7 +65,7 @@ string ChimeraSlayerCommand::getHelpString(){ helpString += "The fasta parameter allows you to enter the fasta file containing your potentially chimeric sequences, and is required, unless you have a valid current fasta file. \n"; helpString += "The name parameter allows you to provide a name file, if you are using reference=self. \n"; helpString += "The group parameter allows you to provide a group file. The group file can be used with a namesfile and reference=self. When checking sequences, only sequences from the same group as the query sequence will be used as the reference. \n"; - helpString += "The count parameter allows you to provide a count file. The count file reference=self. If your count file contains group information, when checking sequences, only sequences from the same group as the query sequence will be used as the reference. \n"; + helpString += "The count parameter allows you to provide a count file. The count file reference=self. If your count file contains group information, when checking sequences, only sequences from the same group as the query sequence will be used as the reference. When you use a count file with group info and dereplicate=T, mothur will create a *.pick.count_table file containing seqeunces after chimeras are removed. \n"; helpString += "You may enter multiple fasta files by separating their names with dashes. ie. fasta=abrecovery.fasta-amazon.fasta \n"; helpString += "The reference parameter allows you to enter a reference file containing known non-chimeric sequences, and is required. You may also set template=self, in this case the abundant sequences will be used as potential parents. \n"; helpString += "The processors parameter allows you to specify how many processors you would like to use. The default is 1. \n"; @@ -110,7 +110,8 @@ string ChimeraSlayerCommand::getOutputPattern(string type) { if (type == "chimera") { pattern = "[filename],slayer.chimeras"; } else if (type == "accnos") { pattern = "[filename],slayer.accnos"; } - else if (type == "fasta") { pattern = "[filename],slayer.fasta"; } + else if (type == "fasta") { pattern = "[filename],slayer.fasta"; } + else if (type == "count") { pattern = "[filename],slayer.pick.count_table"; } else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true; } return pattern; @@ -129,6 +130,7 @@ ChimeraSlayerCommand::ChimeraSlayerCommand(){ outputTypes["chimera"] = tempOutNames; outputTypes["accnos"] = tempOutNames; outputTypes["fasta"] = tempOutNames; + outputTypes["count"] = tempOutNames; } catch(exception& e) { m->errorOut(e, "ChimeraSlayerCommand", "ChimeraSlayerCommand"); @@ -165,6 +167,7 @@ ChimeraSlayerCommand::ChimeraSlayerCommand(string option) { outputTypes["chimera"] = tempOutNames; outputTypes["accnos"] = tempOutNames; outputTypes["fasta"] = tempOutNames; + outputTypes["count"] = tempOutNames; //if the user changes the input directory command factory will send this info to us in the output parameter string inputDir = validParameter.validFile(parameters, "inputdir", false); @@ -677,6 +680,7 @@ int ChimeraSlayerCommand::execute(){ string outputFileName = getOutputFileName("chimera", variables); string accnosFileName = getOutputFileName("accnos", variables); string trimFastaFileName = getOutputFileName("fasta", variables); + string newCountFile = ""; //clears files ofstream out, out1, out2; @@ -746,11 +750,36 @@ int ChimeraSlayerCommand::execute(){ if (m->control_pressed) { outputTypes.clear(); if (trim) { m->mothurRemove(trimFastaFileName); } m->mothurRemove(outputFileName); m->mothurRemove(accnosFileName); for (int j = 0; j < outputNames.size(); j++) { m->mothurRemove(outputNames[j]); } return 0; } #endif }else { //you have provided a groupfile + string countFile = ""; + if (hasCount) { + countFile = nameFileNames[s]; + variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(nameFileNames[s])); + newCountFile = getOutputFileName("count", variables); + } #ifdef USE_MPI - MPIExecuteGroups(outputFileName, accnosFileName, trimFastaFileName, fileToPriority, fileGroup); + MPIExecuteGroups(outputFileName, accnosFileName, trimFastaFileName, fileToPriority, fileGroup, newCountFile, countFile); #else - if (processors == 1) { numSeqs = driverGroups(outputFileName, accnosFileName, trimFastaFileName, fileToPriority, fileGroup); } - else { numSeqs = createProcessesGroups(outputFileName, accnosFileName, trimFastaFileName, fileToPriority, fileGroup); } //destroys fileToPriority + if (processors == 1) { + numSeqs = driverGroups(outputFileName, accnosFileName, trimFastaFileName, fileToPriority, fileGroup, newCountFile); + if (hasCount && dups) { + CountTable c; c.readTable(nameFileNames[s]); + if (!m->isBlank(newCountFile)) { + ifstream in2; + m->openInputFile(newCountFile, in2); + + string name, group; + while (!in2.eof()) { + in2 >> name >> group; m->gobble(in2); + c.setAbund(name, group, 0); + } + in2.close(); + } + m->mothurRemove(newCountFile); + c.printTable(newCountFile); + } + + } + else { numSeqs = createProcessesGroups(outputFileName, accnosFileName, trimFastaFileName, fileToPriority, fileGroup, newCountFile, countFile); } //destroys fileToPriority #endif #ifdef USE_MPI @@ -762,7 +791,29 @@ int ChimeraSlayerCommand::execute(){ if (!dups) { totalChimeras = deconvoluteResults(uniqueNames, outputFileName, accnosFileName, trimFastaFileName); m->mothurOutEndLine(); m->mothurOut(toString(totalChimeras) + " chimera found."); m->mothurOutEndLine(); + }else { + if (hasCount) { + set doNotRemove; + CountTable c; c.readTable(newCountFile); + vector namesInTable = c.getNamesOfSeqs(); + for (int i = 0; i < namesInTable.size(); i++) { + int temp = c.getNumSeqs(namesInTable[i]); + if (temp == 0) { c.remove(namesInTable[i]); } + else { doNotRemove.insert((namesInTable[i])); } + } + //remove names we want to keep from accnos file. + set accnosNames = m->readAccnos(accnosFileName); + ofstream out2; + m->openOutputFile(accnosFileName, out2); + for (set::iterator it = accnosNames.begin(); it != accnosNames.end(); it++) { + if (doNotRemove.count(*it) == 0) { out2 << (*it) << endl; } + } + out2.close(); + c.printTable(newCountFile); + outputNames.push_back(newCountFile); outputTypes["count"].push_back(newCountFile); + } } + #ifdef USE_MPI } MPI_Barrier(MPI_COMM_WORLD); //make everyone wait @@ -800,7 +851,7 @@ int ChimeraSlayerCommand::execute(){ } } //********************************************************************************************************************** -int ChimeraSlayerCommand::MPIExecuteGroups(string outputFileName, string accnosFileName, string trimFastaFileName, map >& fileToPriority, map& fileGroup){ +int ChimeraSlayerCommand::MPIExecuteGroups(string outputFileName, string accnosFileName, string trimFastaFileName, map >& fileToPriority, map& fileGroup, string countlist, string countfile){ try { #ifdef USE_MPI int pid; @@ -826,6 +877,7 @@ int ChimeraSlayerCommand::MPIExecuteGroups(string outputFileName, string accnosF MPI_File outMPI; MPI_File outMPIAccnos; MPI_File outMPIFasta; + MPI_File outMPICount; int outMode=MPI_MODE_CREATE|MPI_MODE_WRONLY; int inMode=MPI_MODE_RDONLY; @@ -838,12 +890,16 @@ int ChimeraSlayerCommand::MPIExecuteGroups(string outputFileName, string accnosF char outFastaFilename[1024]; strcpy(outFastaFilename, trimFastaFileName.c_str()); + + char outCountFilename[1024]; + strcpy(outCountFilename, countlist.c_str()); MPI_File_open(MPI_COMM_WORLD, outFilename, outMode, MPI_INFO_NULL, &outMPI); MPI_File_open(MPI_COMM_WORLD, outAccnosFilename, outMode, MPI_INFO_NULL, &outMPIAccnos); if (trim) { MPI_File_open(MPI_COMM_WORLD, outFastaFilename, outMode, MPI_INFO_NULL, &outMPIFasta); } + if (hasCount && dups) { MPI_File_open(MPI_COMM_WORLD, outCountFilename, outMode, MPI_INFO_NULL, &outMPICount); } - if (m->control_pressed) { MPI_File_close(&outMPI); if (trim) { MPI_File_close(&outMPIFasta); } MPI_File_close(&outMPIAccnos); return 0; } + if (m->control_pressed) { MPI_File_close(&outMPI); if (trim) { MPI_File_close(&outMPIFasta); } MPI_File_close(&outMPIAccnos); if (hasCount && dups) { MPI_File_close(&outMPICount); } return 0; } //print headers if (pid == 0) { //you are the root process @@ -874,16 +930,55 @@ int ChimeraSlayerCommand::MPIExecuteGroups(string outputFileName, string accnosF strcpy(inFileName, thisFastaName.c_str()); MPI_File inMPI; MPI_File_open(MPI_COMM_SELF, inFileName, inMode, MPI_INFO_NULL, &inMPI); //comm, filename, mode, info, filepointer - + MPIPos = m->setFilePosFasta(thisFastaName, num); //fills MPIPos, returns numSeqs cout << endl << "Checking sequences from group: " << fileGroup[thisFastaName] << "." << endl; - driverMPI(0, num, inMPI, outMPI, outMPIAccnos, outMPIFasta, MPIPos, thisFastaName, thisPriority, true); + set cnames; + driverMPI(0, num, inMPI, outMPI, outMPIAccnos, outMPIFasta, cnames, MPIPos, thisFastaName, thisPriority, true); numSeqs += num; MPI_File_close(&inMPI); m->mothurRemove(thisFastaName); + + if (dups) { + if (cnames.size() != 0) { + if (hasCount) { + for (set::iterator it = cnames.begin(); it != cnames.end(); it++) { + string outputString = (*it) + "\t" + fileGroup[thisFastaName] + "\n"; + int length = outputString.length(); + char* buf2 = new char[length]; + memcpy(buf2, outputString.c_str(), length); + MPI_File_write_shared(outMPICount, buf2, length, MPI_CHAR, &status); + delete buf2; + } + }else { + map >::iterator itGroupNameMap = group2NameMap.find(fileGroup[thisFastaName]); + if (itGroupNameMap != group2NameMap.end()) { + map thisnamemap = itGroupNameMap->second; + map::iterator itN; + for (set::iterator it = cnames.begin(); it != cnames.end(); it++) { + itN = thisnamemap.find(*it); + if (itN != thisnamemap.end()) { + vector tempNames; m->splitAtComma(itN->second, tempNames); + for (int j = 0; j < tempNames.size(); j++) { //write to accnos file + string outputString = tempNames[j] + "\n"; + int length = outputString.length(); + char* buf2 = new char[length]; + memcpy(buf2, outputString.c_str(), length); + + MPI_File_write_shared(outMPIAccnos, buf2, length, MPI_CHAR, &status); + delete buf2; + } + + }else { m->mothurOut("[ERROR]: parsing cannot find " + *it + ".\n"); m->control_pressed = true; } + } + }else { m->mothurOut("[ERROR]: parsing cannot find " + fileGroup[thisFastaName] + ".\n"); m->control_pressed = true; } + } + + } + } cout << endl << "It took " << toString(time(NULL) - start) << " secs to check " + toString(num) + " sequences from group " << fileGroup[thisFastaName] << "." << endl; } @@ -899,6 +994,7 @@ int ChimeraSlayerCommand::MPIExecuteGroups(string outputFileName, string accnosF MPI_File_close(&outMPI); MPI_File_close(&outMPIAccnos); if (trim) { MPI_File_close(&outMPIFasta); } + if (hasCount && dups) { MPI_File_close(&outMPICount); } MPI_Barrier(MPI_COMM_WORLD); //make everyone wait #endif @@ -984,7 +1080,8 @@ int ChimeraSlayerCommand::MPIExecute(string inputFile, string outputFileName, st } //do your part - driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPI, outMPIAccnos, outMPIFasta, MPIPos, inputFile, priority, false); + set cnames; + driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPI, outMPIAccnos, outMPIFasta, cnames, MPIPos, inputFile, priority, false); if (m->control_pressed) { MPI_File_close(&inMPI); MPI_File_close(&outMPI); if (trim) { MPI_File_close(&outMPIFasta); } MPI_File_close(&outMPIAccnos); return 0; } @@ -1000,7 +1097,8 @@ int ChimeraSlayerCommand::MPIExecute(string inputFile, string outputFileName, st if(pid == (processors - 1)){ numSeqsPerProcessor = numSeqs - pid * numSeqsPerProcessor; } //do your part - driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPI, outMPIAccnos, outMPIFasta, MPIPos, inputFile, priority, false); + set cnames; + driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPI, outMPIAccnos, outMPIFasta, cnames, MPIPos, inputFile, priority, false); if (m->control_pressed) { MPI_File_close(&inMPI); MPI_File_close(&outMPI); if (trim) { MPI_File_close(&outMPIFasta); } MPI_File_close(&outMPIAccnos); return 0; } @@ -1258,6 +1356,7 @@ int ChimeraSlayerCommand::setUpForSelfReference(SequenceParser*& parser, map thisGroupsSeqs = parser->getSeqs(groups[i]); map thisGroupsMap = parser->getNameMap(groups[i]); + group2NameMap[groups[i]] = thisGroupsMap; string newFastaFile = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])) + groups[i] + "-sortedTemp.fasta"; priority = sortFastaFile(thisGroupsSeqs, thisGroupsMap, newFastaFile); fileToPriority[newFastaFile] = priority; @@ -1352,9 +1451,12 @@ string ChimeraSlayerCommand::getNamesFile(string& inputFile){ } //********************************************************************************************************************** -int ChimeraSlayerCommand::driverGroups(string outputFName, string accnos, string fasta, map >& fileToPriority, map& fileGroup){ +int ChimeraSlayerCommand::driverGroups(string outputFName, string accnos, string fasta, map >& fileToPriority, map& fileGroup, string countlist){ try { int totalSeqs = 0; + ofstream outCountList; + + if (hasCount && dups) { m->openOutputFile(countlist, outCountList); } for (map >::iterator itFile = fileToPriority.begin(); itFile != fileToPriority.end(); itFile++) { @@ -1379,6 +1481,44 @@ int ChimeraSlayerCommand::driverGroups(string outputFName, string accnos, string #endif int numSeqs = driver(lines[0], thisoutputFileName, thisFastaName, thisaccnosFileName, thistrimFastaFileName, thisPriority); + //if we provided a count file with group info and set dereplicate=t, then we want to create a *.pick.count_table + //This table will zero out group counts for seqs determined to be chimeric by that group. + if (dups) { + if (!m->isBlank(thisaccnosFileName)) { + ifstream in; + m->openInputFile(thisaccnosFileName, in); + string name; + if (hasCount) { + while (!in.eof()) { + in >> name; m->gobble(in); + outCountList << name << '\t' << fileGroup[thisFastaName] << endl; + } + in.close(); + }else { + map >::iterator itGroupNameMap = group2NameMap.find(fileGroup[thisFastaName]); + if (itGroupNameMap != group2NameMap.end()) { + map thisnamemap = itGroupNameMap->second; + map::iterator itN; + ofstream out; + m->openOutputFile(thisaccnosFileName+".temp", out); + while (!in.eof()) { + in >> name; m->gobble(in); + itN = thisnamemap.find(name); + if (itN != thisnamemap.end()) { + vector tempNames; m->splitAtComma(itN->second, tempNames); + for (int j = 0; j < tempNames.size(); j++) { out << tempNames[j] << endl; } + + }else { m->mothurOut("[ERROR]: parsing cannot find " + name + ".\n"); m->control_pressed = true; } + } + out.close(); + in.close(); + m->renameFile(thisaccnosFileName+".temp", thisaccnosFileName); + }else { m->mothurOut("[ERROR]: parsing cannot find " + fileGroup[thisFastaName] + ".\n"); m->control_pressed = true; } + } + + } + } + //append files m->appendFiles(thisoutputFileName, outputFName); m->mothurRemove(thisoutputFileName); m->appendFiles(thisaccnosFileName, accnos); m->mothurRemove(thisaccnosFileName); @@ -1390,6 +1530,8 @@ int ChimeraSlayerCommand::driverGroups(string outputFName, string accnos, string m->mothurOutEndLine(); m->mothurOut("It took " + toString(time(NULL) - start) + " secs to check " + toString(numSeqs) + " sequences from group " + fileGroup[thisFastaName] + "."); m->mothurOutEndLine(); } + if (hasCount && dups) { outCountList.close(); } + return totalSeqs; } catch(exception& e) { @@ -1398,13 +1540,16 @@ int ChimeraSlayerCommand::driverGroups(string outputFName, string accnos, string } } /**************************************************************************************************/ -int ChimeraSlayerCommand::createProcessesGroups(string outputFName, string accnos, string fasta, map >& fileToPriority, map& fileGroup) { +int ChimeraSlayerCommand::createProcessesGroups(string outputFName, string accnos, string fasta, map >& fileToPriority, map& fileGroup, string countlist, string countFile) { try { int process = 1; int num = 0; processIDS.clear(); if (fileToPriority.size() < processors) { processors = fileToPriority.size(); } + + CountTable newCount; + if (hasCount && dups) { newCount.readTable(countFile); } int groupsPerProcessor = fileToPriority.size() / processors; int remainder = fileToPriority.size() % processors; @@ -1436,7 +1581,7 @@ int ChimeraSlayerCommand::createProcessesGroups(string outputFName, string accno processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later process++; }else if (pid == 0){ - num = driverGroups(outputFName + toString(getpid()) + ".temp", accnos + toString(getpid()) + ".temp", fasta + toString(getpid()) + ".temp", breakUp[process], fileGroup); + num = driverGroups(outputFName + toString(getpid()) + ".temp", accnos + toString(getpid()) + ".temp", fasta + toString(getpid()) + ".temp", breakUp[process], fileGroup, accnos + toString(getpid()) + ".byCount"); //pass numSeqs to parent ofstream out; @@ -1452,7 +1597,7 @@ int ChimeraSlayerCommand::createProcessesGroups(string outputFName, string accno } } - num = driverGroups(outputFName, accnos, fasta, breakUp[0], fileGroup); + num = driverGroups(outputFName, accnos, fasta, breakUp[0], fileGroup, accnos + ".byCount"); //force parent to wait until all the processes are done for (int i=0;icount != pDataArray[i]->end) { - m->mothurOut("[ERROR]: process " + toString(i) + " only processed " + toString(pDataArray[i]->count) + " of " + toString(pDataArray[i]->end) + " sequences assigned to it, quitting. \n"); m->control_pressed = true; + if (pDataArray[i]->fileToPriority.size() != pDataArray[i]->end) { + m->mothurOut("[ERROR]: process " + toString(i) + " only processed " + toString(pDataArray[i]->end) + " of " + toString(pDataArray[i]->fileToPriority.size()) + " groups assigned to it, quitting. \n"); m->control_pressed = true; } num += pDataArray[i]->count; CloseHandle(hThreadArray[i]); delete pDataArray[i]; } #endif + //read my own + if (hasCount && dups) { + if (!m->isBlank(accnos + ".byCount")) { + ifstream in2; + m->openInputFile(accnos + ".byCount", in2); + + string name, group; + while (!in2.eof()) { + in2 >> name >> group; m->gobble(in2); + newCount.setAbund(name, group, 0); + } + in2.close(); + } + m->mothurRemove(accnos + ".byCount"); + } + //append output files for(int i=0;iappendFiles((fasta + toString(processIDS[i]) + ".temp"), fasta); m->mothurRemove((fasta + toString(processIDS[i]) + ".temp")); } + + if (hasCount && dups) { + if (!m->isBlank(accnos + toString(processIDS[i]) + ".byCount")) { + ifstream in2; + m->openInputFile(accnos + toString(processIDS[i]) + ".byCount", in2); + + string name, group; + while (!in2.eof()) { + in2 >> name >> group; m->gobble(in2); + newCount.setAbund(name, group, 0); + } + in2.close(); + } + m->mothurRemove(accnos + toString(processIDS[i]) + ".byCount"); + } + } + //print new *.pick.count_table + if (hasCount && dups) { newCount.printTable(countlist); } return num; } @@ -1667,7 +1846,7 @@ int ChimeraSlayerCommand::driver(linePair filePos, string outputFName, string fi } //********************************************************************************************************************** #ifdef USE_MPI -int ChimeraSlayerCommand::driverMPI(int start, int num, MPI_File& inMPI, MPI_File& outMPI, MPI_File& outAccMPI, MPI_File& outFastaMPI, vector& MPIPos, string filename, map& priority, bool byGroup){ +int ChimeraSlayerCommand::driverMPI(int start, int num, MPI_File& inMPI, MPI_File& outMPI, MPI_File& outAccMPI, MPI_File& outFastaMPI, set& cnames, vector& MPIPos, string filename, map& priority, bool byGroup){ try { MPI_Status status; int pid; @@ -1751,7 +1930,10 @@ int ChimeraSlayerCommand::driverMPI(int start, int num, MPI_File& inMPI, MPI_Fil data_results rightResults = chimera->getResults(); //if either piece is chimeric then report - Sequence trimmed = chimera->print(outMPI, outAccMPI, leftResults, rightResults); + bool flag = false; + Sequence trimmed = chimera->print(outMPI, outAccMPI, leftResults, rightResults, flag); + if (flag) { cnames.insert(candidateSeq->getName()); } + if (trim) { string outputString = ">" + trimmed.getName() + "\n" + trimmed.getAligned() + "\n"; @@ -1769,6 +1951,7 @@ int ChimeraSlayerCommand::driverMPI(int start, int num, MPI_File& inMPI, MPI_Fil }else { //print results Sequence trimmed = chimera->print(outMPI, outAccMPI); + cnames.insert(candidateSeq->getName()); if (trim) { string outputString = ">" + trimmed.getName() + "\n" + trimmed.getAligned() + "\n"; diff --git a/chimeraslayercommand.h b/chimeraslayercommand.h index ee43bcb..68e03b8 100644 --- a/chimeraslayercommand.h +++ b/chimeraslayercommand.h @@ -61,13 +61,13 @@ private: map priority; int setUpForSelfReference(SequenceParser*&, map&, map >&, int); int setUpForSelfReference(SequenceCountParser*&, map&, map >&, int); - int driverGroups(string, string, string, map >&, map&); - int createProcessesGroups(string, string, string, map >&, map&); - int MPIExecuteGroups(string, string, string, map >&, map&); + int driverGroups(string, string, string, map >&, map&, string); + int createProcessesGroups(string, string, string, map >&, map&, string, string); + int MPIExecuteGroups(string, string, string, map >&, map&, string, string); #ifdef USE_MPI - int driverMPI(int, int, MPI_File&, MPI_File&, MPI_File&, MPI_File&, vector&, string, map&, bool); + int driverMPI(int, int, MPI_File&, MPI_File&, MPI_File&, MPI_File&, set&, vector&, string, map&, bool); #endif bool abort, realign, trim, trimera, save, hasName, hasCount, dups; @@ -75,6 +75,7 @@ private: int processors, window, iters, increment, numwanted, ksize, match, mismatch, parents, minSimilarity, minCoverage, minBS, minSNP, numSeqs, templateSeqsLength; float divR; + map > group2NameMap; vector outputNames; vector fastaFileNames; vector nameFileNames; @@ -91,12 +92,12 @@ struct slayerData { string outputFName; string fasta; string accnos; - string filename; + string filename, countlist; string templatefile; string search; string blastlocation; bool trimera; - bool trim, realign; + bool trim, realign, dups, hasCount; unsigned long long start; unsigned long long end; int ksize, match, mismatch, window, minSimilarity, minCoverage, minBS, minSNP, parents, iters, increment, numwanted; @@ -108,6 +109,7 @@ struct slayerData { int threadId; map > fileToPriority; map fileGroup; + map > group2NameMap; slayerData(){} slayerData(string o, string fa, string ac, string f, string te, string se, string bl, bool tri, bool trm, bool re, MothurOut* mout, unsigned long long st, unsigned long long en, int ks, int ma, int mis, int win, int minS, int minC, int miBS, int minSN, int par, int it, int inc, int numw, float div, map prior, int tid) { @@ -142,13 +144,17 @@ struct slayerData { count = 0; numNoParents = 0; } - slayerData(string o, string fa, string ac, string te, string se, string bl, bool tri, bool trm, bool re, MothurOut* mout, map >& fPriority, map& fileG, int ks, int ma, int mis, int win, int minS, int minC, int miBS, int minSN, int par, int it, int inc, int numw, float div, map prior, int tid) { + slayerData(map > g2n, bool hc, bool dps, string cl, string o, string fa, string ac, string te, string se, string bl, bool tri, bool trm, bool re, MothurOut* mout, map >& fPriority, map& fileG, int ks, int ma, int mis, int win, int minS, int minC, int miBS, int minSN, int par, int it, int inc, int numw, float div, map prior, int tid) { outputFName = o; fasta = fa; accnos = ac; templatefile = te; search = se; blastlocation = bl; + countlist = cl; + dups = dps; + hasCount = hc; + group2NameMap = g2n; trimera = tri; trim = trm; realign = re; @@ -351,8 +357,11 @@ static DWORD WINAPI MySlayerGroupThreadFunction(LPVOID lpParam){ pDataArray = (slayerData*)lpParam; try { - + ofstream outCountList; + if (pDataArray->hasCount && pDataArray->dups) { pDataArray->m->openOutputFile(pDataArray->countlist, outCountList); } + int totalSeqs = 0; + pDataArray->end = 0; for (map >::iterator itFile = pDataArray->fileToPriority.begin(); itFile != pDataArray->fileToPriority.end(); itFile++) { @@ -510,10 +519,51 @@ static DWORD WINAPI MySlayerGroupThreadFunction(LPVOID lpParam){ if (pDataArray->trim) { out3.close(); } inFASTA.close(); delete chimera; - + pDataArray->end++; //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + //if we provided a count file with group info and set dereplicate=t, then we want to create a *.pick.count_table + //This table will zero out group counts for seqs determined to be chimeric by that group. + if (pDataArray->dups) { + if (!pDataArray->m->isBlank(thisaccnosFileName)) { + ifstream in; + pDataArray->m->openInputFile(thisaccnosFileName, in); + string name; + if (pDataArray->hasCount) { + while (!in.eof()) { + in >> name; pDataArray->m->gobble(in); + outCountList << name << '\t' << pDataArray->fileGroup[thisFastaName] << endl; + } + in.close(); + }else { + map >::iterator itGroupNameMap = pDataArray->group2NameMap.find(pDataArray->fileGroup[thisFastaName]); + if (itGroupNameMap != pDataArray->group2NameMap.end()) { + map thisnamemap = itGroupNameMap->second; + map::iterator itN; + ofstream out; + pDataArray->m->openOutputFile(thisaccnosFileName+".temp", out); + while (!in.eof()) { + in >> name; pDataArray->m->gobble(in); + //pDataArray->m->mothurOut("here = " + name + '\t'); + itN = thisnamemap.find(name); + if (itN != thisnamemap.end()) { + vector tempNames; pDataArray->m->splitAtComma(itN->second, tempNames); + for (int j = 0; j < tempNames.size(); j++) { out << tempNames[j] << endl; } + //pDataArray->m->mothurOut(itN->second + '\n'); + + }else { pDataArray->m->mothurOut("[ERROR]: parsing cannot find " + name + ".\n"); pDataArray->m->control_pressed = true; } + } + out.close(); + in.close(); + pDataArray->m->renameFile(thisaccnosFileName+".temp", thisaccnosFileName); + }else { pDataArray->m->mothurOut("[ERROR]: parsing cannot find " + pDataArray->fileGroup[thisFastaName] + ".\n"); pDataArray->m->control_pressed = true; } + } + + } + } + + //append files pDataArray->m->appendFiles(thisoutputFileName, pDataArray->outputFName); pDataArray->m->mothurRemove(thisoutputFileName); pDataArray->m->appendFiles(thisaccnosFileName, pDataArray->accnos); pDataArray->m->mothurRemove(thisaccnosFileName); @@ -526,6 +576,7 @@ static DWORD WINAPI MySlayerGroupThreadFunction(LPVOID lpParam){ } pDataArray->count = totalSeqs; + if (pDataArray->hasCount && pDataArray->dups) { outCountList.close(); } return 0; diff --git a/chimerauchimecommand.cpp b/chimerauchimecommand.cpp index cb155eb..1d7e252 100644 --- a/chimerauchimecommand.cpp +++ b/chimerauchimecommand.cpp @@ -774,6 +774,7 @@ int ChimeraUchimeCommand::execute(){ } out2.close(); c.printTable(newCountFile); + outputNames.push_back(newCountFile); outputTypes["count"].push_back(newCountFile); } } @@ -839,10 +840,6 @@ int ChimeraUchimeCommand::deconvoluteResults(map& uniqueNames, s map::iterator itUnique; int total = 0; - //edit accnos file - ifstream in2; - m->openInputFile(accnosFileName, in2); - ofstream out2; m->openOutputFile(accnosFileName+".temp", out2); @@ -852,27 +849,32 @@ int ChimeraUchimeCommand::deconvoluteResults(map& uniqueNames, s set chimerasInFile; set::iterator itChimeras; - - while (!in2.eof()) { - if (m->control_pressed) { in2.close(); out2.close(); m->mothurRemove(outputFileName); m->mothurRemove((accnosFileName+".temp")); return 0; } - - in2 >> name; m->gobble(in2); - - //find unique name - itUnique = uniqueNames.find(name); - - if (itUnique == uniqueNames.end()) { m->mothurOut("[ERROR]: trouble parsing accnos results. Cannot find " + name + "."); m->mothurOutEndLine(); m->control_pressed = true; } - else { - itChimeras = chimerasInFile.find((itUnique->second)); - - if (itChimeras == chimerasInFile.end()) { - out2 << itUnique->second << endl; - chimerasInFile.insert((itUnique->second)); - total++; - } - } - } - in2.close(); + if (!m->isBlank(accnosFileName)) { + //edit accnos file + ifstream in2; + m->openInputFile(accnosFileName, in2); + + while (!in2.eof()) { + if (m->control_pressed) { in2.close(); out2.close(); m->mothurRemove(outputFileName); m->mothurRemove((accnosFileName+".temp")); return 0; } + + in2 >> name; m->gobble(in2); + + //find unique name + itUnique = uniqueNames.find(name); + + if (itUnique == uniqueNames.end()) { m->mothurOut("[ERROR]: trouble parsing accnos results. Cannot find " + name + "."); m->mothurOutEndLine(); m->control_pressed = true; } + else { + itChimeras = chimerasInFile.find((itUnique->second)); + + if (itChimeras == chimerasInFile.end()) { + out2 << itUnique->second << endl; + chimerasInFile.insert((itUnique->second)); + total++; + } + } + } + in2.close(); + } out2.close(); m->mothurRemove(accnosFileName); @@ -1181,6 +1183,7 @@ int ChimeraUchimeCommand::driverGroups(string outputFName, string filename, stri int totalSeqs = 0; int numChimeras = 0; + ofstream outCountList; if (hasCount && dups) { m->openOutputFile(countlist, outCountList); } diff --git a/clustersplitcommand.cpp b/clustersplitcommand.cpp index 92f0e48..37e42d2 100644 --- a/clustersplitcommand.cpp +++ b/clustersplitcommand.cpp @@ -442,8 +442,7 @@ int ClusterSplitCommand::execute(){ if (m->debug) { m->mothurOut("[DEBUG]: distName.size() = " + toString(distName.size()) + ".\n"); } //output a merged distance file - if (splitmethod == "fasta") { createMergedDistanceFile(distName); } - + //if (splitmethod == "fasta") { createMergedDistanceFile(distName); } if (m->control_pressed) { return 0; } diff --git a/deconvolutecommand.cpp b/deconvolutecommand.cpp index f33a140..09e077b 100644 --- a/deconvolutecommand.cpp +++ b/deconvolutecommand.cpp @@ -198,17 +198,24 @@ int DeconvoluteCommand::execute() { map nameMap; map::iterator itNames; if (oldNameMapFName != "") { - m->readNames(oldNameMapFName, nameMap); - if (oldNameMapFName == outNameFile){ - variables["[tag]"] = "unique"; - outNameFile = getOutputFileName("name", variables); } + m->readNames(oldNameMapFName, nameMap); + if (oldNameMapFName == outNameFile){ + //prepare filenames and open files + map mvariables; + mvariables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(inFastaName)); + mvariables["[tag]"] = "unique"; + outNameFile = getOutputFileName("name", mvariables); + } } CountTable ct; if (countfile != "") { ct.readTable(countfile); - if (countfile == outCountFile){ - variables["[tag]"] = "unique"; - outCountFile = getOutputFileName("count", variables); } + if (countfile == outCountFile){ + //prepare filenames and open files + map mvariables; + mvariables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(inFastaName)); + mvariables["[tag]"] = "unique"; + outCountFile = getOutputFileName("count", mvariables); } } if (m->control_pressed) { return 0; } diff --git a/getseqscommand.cpp b/getseqscommand.cpp index 779ea91..480bde3 100644 --- a/getseqscommand.cpp +++ b/getseqscommand.cpp @@ -427,6 +427,11 @@ int GetSeqsCommand::readFasta(){ Sequence currSeq(in); name = currSeq.getName(); + + if (!dups) {//adjust name if needed + map::iterator it = uniqueMap.find(name); + if (it != uniqueMap.end()) { name = it->second; } + } if (name != "") { //if this name is in the accnos file @@ -485,7 +490,12 @@ int GetSeqsCommand::readQual(){ string name = ""; string scores = ""; - in >> name; + in >> name; + + if (!dups) {//adjust name if needed + map::iterator it = uniqueMap.find(name); + if (it != uniqueMap.end()) { name = it->second; } + } if (name.length() != 0) { saveName = name.substr(1); @@ -748,6 +758,8 @@ int GetSeqsCommand::readName(){ wroteSomething = true; out << validSecond[0] << '\t'; + //we are changing the unique name in the fasta file + uniqueMap[firstCol] = validSecond[0]; //you know you have at least one valid second since first column is valid for (int i = 0; i < validSecond.size()-1; i++) { out << validSecond[i] << ','; } @@ -805,6 +817,7 @@ int GetSeqsCommand::readGroup(){ in >> name; //read from first column in >> group; //read from second column + //if this name is in the accnos file if (names.count(name) != 0) { @@ -862,6 +875,11 @@ int GetSeqsCommand::readTax(){ in >> name; //read from first column in >> tax; //read from second column + + if (!dups) {//adjust name if needed + map::iterator it = uniqueMap.find(name); + if (it != uniqueMap.end()) { name = it->second; } + } //if this name is in the accnos file if (names.count(name) != 0) { @@ -924,6 +942,11 @@ int GetSeqsCommand::readAlign(){ in >> name; //read from first column + + if (!dups) {//adjust name if needed + map::iterator it = uniqueMap.find(name); + if (it != uniqueMap.end()) { name = it->second; } + } //if this name is in the accnos file if (names.count(name) != 0) { diff --git a/getseqscommand.h b/getseqscommand.h index 42070e5..c5b6ca4 100644 --- a/getseqscommand.h +++ b/getseqscommand.h @@ -38,7 +38,7 @@ class GetSeqsCommand : public Command { vector outputNames; string accnosfile, accnosfile2, fastafile, namefile, countfile, groupfile, alignfile, listfile, taxfile, qualfile, outputDir; bool abort, dups; - + map uniqueMap; //for debug map > sanity; //maps file type to names chosen for file. something like "fasta" -> vector. If running in debug mode this is filled and we check to make sure all the files have the same names. If they don't we output the differences for the user. diff --git a/makecontigscommand.cpp b/makecontigscommand.cpp index c486637..7d7184f 100644 --- a/makecontigscommand.cpp +++ b/makecontigscommand.cpp @@ -21,8 +21,6 @@ vector MakeContigsCommand::setParameters(){ CommandParameter poligos("oligos", "InputTypes", "", "", "none", "none", "none","group",false,false,true); parameters.push_back(poligos); CommandParameter ppdiffs("pdiffs", "Number", "", "0", "", "", "","",false,false,true); parameters.push_back(ppdiffs); CommandParameter pbdiffs("bdiffs", "Number", "", "0", "", "", "","",false,false,true); parameters.push_back(pbdiffs); -// CommandParameter pldiffs("ldiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(pldiffs); -// CommandParameter psdiffs("sdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(psdiffs); CommandParameter ptdiffs("tdiffs", "Number", "", "0", "", "", "","",false,false); parameters.push_back(ptdiffs); CommandParameter palign("align", "Multiple", "needleman-gotoh", "needleman", "", "", "","",false,false); parameters.push_back(palign); @@ -56,7 +54,7 @@ string MakeContigsCommand::getHelpString(){ helpString += "If an oligos file is provided barcodes and primers will be trimmed, and a group file will be created.\n"; helpString += "The make.contigs command parameters are file, ffastq, rfastq, ffasta, rfasta, fqfile, rqfile, oligos, format, tdiffs, bdiffs, pdiffs, align, match, mismatch, gapopen, gapextend, insert, deltaq, allfiles and processors.\n"; helpString += "The ffastq and rfastq, file, or ffasta and rfasta parameters are required.\n"; - helpString += "The file parameter is 2 column file containing the forward fastq files in the first column and their matching reverse fastq files in the second column. Mothur will process each pair and create a combined fasta and report file with all the sequences.\n"; + helpString += "The file parameter is 2 or 3 column file containing the forward fastq files in the first column and their matching reverse fastq files in the second column, or a groupName then forward fastq file and reverse fastq file. Mothur will process each pair and create a combined fasta and report file with all the sequences.\n"; helpString += "The ffastq and rfastq parameters are used to provide a forward fastq and reverse fastq file to process. If you provide one, you must provide the other.\n"; helpString += "The ffasta and rfasta parameters are used to provide a forward fasta and reverse fasta file to process. If you provide one, you must provide the other.\n"; helpString += "The fqfile and rqfile parameters are used to provide a forward quality and reverse quality files to process with the ffasta and rfasta parameters. If you provide one, you must provide the other.\n"; @@ -121,7 +119,8 @@ MakeContigsCommand::MakeContigsCommand(){ //********************************************************************************************************************** MakeContigsCommand::MakeContigsCommand(string option) { try { - abort = false; calledHelp = false; + abort = false; calledHelp = false; + createFileGroup = false; createOligosGroup = false; //allow user to run help if(option == "help") { help(); abort = true; calledHelp = true; } @@ -392,33 +391,30 @@ int MakeContigsCommand::execute(){ m->mothurOut("\n>>>>>\tProcessing " + filesToProcess[l][0][0] + " (file " + toString(l+1) + " of " + toString(filesToProcess.size()) + ")\t<<<<<\n"); vector > fastaFileNames; - createGroup = false; + createOligosGroup = false; string outputGroupFileName; map variables; string thisOutputDir = outputDir; if (outputDir == "") { thisOutputDir = m->hasPath(filesToProcess[l][0][0]); } variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(filesToProcess[l][0][0])); variables["[tag]"] = ""; - if(oligosfile != ""){ - createGroup = getOligos(fastaFileNames, variables["[filename]"]); - if (createGroup) { - outputGroupFileName = getOutputFileName("group",variables); - outputNames.push_back(outputGroupFileName); outputTypes["group"].push_back(outputGroupFileName); - } + if(oligosfile != ""){ createOligosGroup = getOligos(fastaFileNames, variables["[filename]"]); } + if (createOligosGroup || createFileGroup) { + outputGroupFileName = getOutputFileName("group",variables); } + //give group in file file precedence + if (createFileGroup) { createOligosGroup = false; } + variables["[tag]"] = "trim"; string outFastaFile = getOutputFileName("fasta",variables); variables["[tag]"] = "scrap"; string outScrapFastaFile = getOutputFileName("fasta",variables); variables["[tag]"] = ""; string outMisMatchFile = getOutputFileName("report",variables); - outputNames.push_back(outFastaFile); outputTypes["fasta"].push_back(outFastaFile); - outputNames.push_back(outScrapFastaFile); outputTypes["fasta"].push_back(outScrapFastaFile); - outputNames.push_back(outMisMatchFile); outputTypes["report"].push_back(outMisMatchFile); - + m->mothurOut("Making contigs...\n"); - createProcesses(filesToProcess[l], outFastaFile, outScrapFastaFile, outMisMatchFile, fastaFileNames); + createProcesses(filesToProcess[l], outFastaFile, outScrapFastaFile, outMisMatchFile, fastaFileNames, l); m->mothurOut("Done.\n"); //remove temp fasta and qual files @@ -473,7 +469,7 @@ int MakeContigsCommand::execute(){ } } - if (createGroup) { + if (createFileGroup || createOligosGroup) { ofstream outGroup; m->openOutputFile(outputGroupFileName, outGroup); for (map::iterator itGroup = groupMap.begin(); itGroup != groupMap.end(); itGroup++) { @@ -483,17 +479,34 @@ int MakeContigsCommand::execute(){ } if (filesToProcess.size() > 1) { //merge into large combo files - if (createGroup) { + if (createFileGroup || createOligosGroup) { if (l == 0) { ofstream outCGroup; m->openOutputFile(compositeGroupFile, outCGroup); outCGroup.close(); outputNames.push_back(compositeGroupFile); outputTypes["group"].push_back(compositeGroupFile); } - m->appendFiles(outputGroupFileName, compositeGroupFile); + m->appendFiles(outputGroupFileName, compositeGroupFile); + if (!allFiles) { m->mothurRemove(outputGroupFileName); } + else { outputNames.push_back(outputGroupFileName); outputTypes["group"].push_back(outputGroupFileName); } } - m->appendFiles(outMisMatchFile, compositeMisMatchFile); + if (l == 0) { m->appendFiles(outMisMatchFile, compositeMisMatchFile); } + else { m->appendFilesWithoutHeaders(outMisMatchFile, compositeMisMatchFile); } m->appendFiles(outFastaFile, compositeFastaFile); m->appendFiles(outScrapFastaFile, compositeScrapFastaFile); + if (!allFiles) { + m->mothurRemove(outMisMatchFile); + m->mothurRemove(outFastaFile); + m->mothurRemove(outScrapFastaFile); + }else { + outputNames.push_back(outFastaFile); outputTypes["fasta"].push_back(outFastaFile); + outputNames.push_back(outScrapFastaFile); outputTypes["fasta"].push_back(outScrapFastaFile); + outputNames.push_back(outMisMatchFile); outputTypes["report"].push_back(outMisMatchFile); + } + }else { + outputNames.push_back(outFastaFile); outputTypes["fasta"].push_back(outFastaFile); + outputNames.push_back(outScrapFastaFile); outputTypes["fasta"].push_back(outScrapFastaFile); + outputNames.push_back(outMisMatchFile); outputTypes["report"].push_back(outMisMatchFile); + if (createFileGroup || createOligosGroup) { outputNames.push_back(outputGroupFileName); outputTypes["group"].push_back(outputGroupFileName); } } } m->mothurOut("It took " + toString(time(NULL) - start) + " secs to process " + toString(numReads) + " sequences.\n"); @@ -597,10 +610,14 @@ vector< vector< vector > > MakeContigsCommand::preProcessData(unsigned l } } //********************************************************************************************************************** -int MakeContigsCommand::createProcesses(vector< vector > files, string outputFasta, string outputScrapFasta, string outputMisMatches, vector > fastaFileNames) { +int MakeContigsCommand::createProcesses(vector< vector > files, string outputFasta, string outputScrapFasta, string outputMisMatches, vector > fastaFileNames, int index) { try { int num = 0; vector processIDS; + string group = ""; + map::iterator it = file2Group.find(index); + if (it != file2Group.end()) { group = it->second; } + #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) int process = 0; @@ -631,14 +648,14 @@ int MakeContigsCommand::createProcesses(vector< vector > files, string o outputFasta + toString(getpid()) + ".temp", outputScrapFasta + toString(getpid()) + ".temp", outputMisMatches + toString(getpid()) + ".temp", - tempFASTAFileNames, process); + tempFASTAFileNames, process, group); //pass groupCounts to parent ofstream out; string tempFile = toString(getpid()) + ".num.temp"; m->openOutputFile(tempFile, out); out << num << endl; - if(createGroup){ + if (createFileGroup || createOligosGroup) { out << groupCounts.size() << endl; for (map::iterator it = groupCounts.begin(); it != groupCounts.end(); it++) { @@ -665,7 +682,7 @@ int MakeContigsCommand::createProcesses(vector< vector > files, string o m->openOutputFile(outputScrapFasta, temp); temp.close(); //do my part - num = driver(files[processors-1], outputFasta, outputScrapFasta, outputMisMatches, fastaFileNames, processors-1); + num = driver(files[processors-1], outputFasta, outputScrapFasta, outputMisMatches, fastaFileNames, processors-1, group); //force parent to wait until all the processes are done for (int i=0;i > files, string o int tempNum; in >> tempNum; num += tempNum; m->gobble(in); - if(createGroup){ + if (createFileGroup || createOligosGroup) { string group; in >> tempNum; m->gobble(in); @@ -739,7 +756,7 @@ int MakeContigsCommand::createProcesses(vector< vector > files, string o } - contigsData* tempcontig = new contigsData(files[h], (outputFasta + extension), (outputScrapFasta + extension), (outputMisMatches + extension), align, m, match, misMatch, gapOpen, gapExtend, insert, deltaq, barcodes, primers, tempFASTAFileNames, barcodeNameVector, primerNameVector, pdiffs, bdiffs, tdiffs, createGroup, allFiles, trimOverlap, h); + contigsData* tempcontig = new contigsData(group, files[h], (outputFasta + extension), (outputScrapFasta + extension), (outputMisMatches + extension), align, m, match, misMatch, gapOpen, gapExtend, insert, deltaq, barcodes, primers, tempFASTAFileNames, barcodeNameVector, primerNameVector, pdiffs, bdiffs, tdiffs, createOligosGroup, createFileGroup, allFiles, trimOverlap, h); pDataArray.push_back(tempcontig); hThreadArray[h] = CreateThread(NULL, 0, MyContigsThreadFunction, pDataArray[h], 0, &dwThreadIdArray[h]); @@ -768,7 +785,7 @@ int MakeContigsCommand::createProcesses(vector< vector > files, string o //do my part processIDS.push_back(processors-1); - num = driver(files[processors-1], (outputFasta+ toString(processors-1) + ".temp"), (outputScrapFasta+ toString(processors-1) + ".temp"), (outputMisMatches+ toString(processors-1) + ".temp"), tempFASTAFileNames, processors-1); + num = driver(files[processors-1], (outputFasta+ toString(processors-1) + ".temp"), (outputScrapFasta+ toString(processors-1) + ".temp"), (outputMisMatches+ toString(processors-1) + ".temp"), tempFASTAFileNames, processors-1, group); //Wait until all threads have terminated. WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE); @@ -802,7 +819,7 @@ int MakeContigsCommand::createProcesses(vector< vector > files, string o m->appendFiles((outputScrapFasta + toString(processIDS[i]) + ".temp"), outputScrapFasta); m->mothurRemove((outputScrapFasta + toString(processIDS[i]) + ".temp")); - m->appendFiles((outputMisMatches + toString(processIDS[i]) + ".temp"), outputMisMatches); + m->appendFilesWithoutHeaders((outputMisMatches + toString(processIDS[i]) + ".temp"), outputMisMatches); m->mothurRemove((outputMisMatches + toString(processIDS[i]) + ".temp")); if(allFiles){ @@ -825,7 +842,7 @@ int MakeContigsCommand::createProcesses(vector< vector > files, string o } } //********************************************************************************************************************** -int MakeContigsCommand::driver(vector files, string outputFasta, string outputScrapFasta, string outputMisMatches, vector > fastaFileNames, int process){ +int MakeContigsCommand::driver(vector files, string outputFasta, string outputScrapFasta, string outputMisMatches, vector > fastaFileNames, int process, string group){ try { Alignment* alignment; @@ -851,7 +868,7 @@ int MakeContigsCommand::driver(vector files, string outputFasta, string m->openOutputFile(outputFasta, outFasta); m->openOutputFile(outputScrapFasta, outScrapFasta); m->openOutputFile(outputMisMatches, outMisMatch); - if (process == 0) { outMisMatch << "Name\tLength\tOverlap_Length\tOverlap_Start\tOverlap_End\tMisMatches\tNum_Ns\n"; } + outMisMatch << "Name\tLength\tOverlap_Length\tOverlap_Start\tOverlap_End\tMisMatches\tNum_Ns\n"; TrimOligos trimOligos(pdiffs, bdiffs, 0, 0, primers, barcodes); @@ -981,7 +998,7 @@ int MakeContigsCommand::driver(vector files, string outputFasta, string if (m->debug) { m->mothurOut(fSeq.getName()); } - if (createGroup) { + if (createOligosGroup) { if(barcodes.size() != 0){ string thisGroup = barcodeNameVector[barcodeIndex]; if (primers.size() != 0) { @@ -1006,6 +1023,15 @@ int MakeContigsCommand::driver(vector files, string outputFasta, string }else { ignore = true; } } + }else if (createFileGroup) { + int pos = group.find("ignore"); + if (pos == string::npos) { + groupMap[fSeq.getName()] = group; + + map::iterator it = groupCounts.find(group); + if (it == groupCounts.end()) { groupCounts[group] = 1; } + else { groupCounts[it->first] ++; } + }else { ignore = true; } } if (m->debug) { m->mothurOut("\n"); } @@ -1479,7 +1505,26 @@ vector< vector > MakeContigsCommand::readFileNames(string filename){ if (m->control_pressed) { return files; } in >> forward; m->gobble(in); - in >> reverse; m->gobble(in); + in >> reverse; + + string group = ""; + while (!in.eof()) { //do we have a group assigned to this pair + char c = in.get(); + if (c == 10 || c == 13 || c == -1){ break; } + else if (c == 32 || c == 9){;} //space or tab + else { group += c; } + } + + if (group != "") { + //line in file look like: group forward reverse + string temp = forward; + forward = reverse; + reverse = group; + group = temp; + createFileGroup = true; + } + m->gobble(in); + //check to make sure both are able to be opened ifstream in2; @@ -1545,12 +1590,12 @@ vector< vector > MakeContigsCommand::readFileNames(string filename){ }else{ in3.close(); } if ((openForward != 1) && (openReverse != 1)) { //good pair + file2Group[files.size()] = group; vector pair; pair.push_back(forward); pair.push_back(reverse); files.push_back(pair); } - } in.close(); @@ -1604,7 +1649,7 @@ bool MakeContigsCommand::getOligos(vector >& fastaFileNames, stri if(foligo[i] == 'U') { foligo[i] = 'T'; } } - if(type == "FORWARD"){ + if(type == "PRIMER"){ m->gobble(in); in >> roligo; diff --git a/makecontigscommand.h b/makecontigscommand.h index 2fbc09e..62de6d4 100644 --- a/makecontigscommand.h +++ b/makecontigscommand.h @@ -59,7 +59,7 @@ public: void help() { m->mothurOut(getHelpString()); } private: - bool abort, allFiles, createGroup, trimOverlap; + bool abort, allFiles, trimOverlap, createFileGroup, createOligosGroup; string outputDir, ffastqfile, rfastqfile, align, oligosfile, rfastafile, ffastafile, rqualfile, fqualfile, file, format; float match, misMatch, gapOpen, gapExtend; int processors, longestBase, insert, tdiffs, bdiffs, pdiffs, ldiffs, sdiffs, deltaq; @@ -75,6 +75,7 @@ private: map groupCounts; map groupMap; + map file2Group; vector convertQual(string); fastqRead readFastq(ifstream&, bool&); @@ -83,8 +84,8 @@ private: vector< vector > readFastqFiles(unsigned long int&, string, string); vector< vector > readFastaFiles(unsigned long int&, string, string); //bool checkReads(fastqRead&, fastqRead&, string, string); - int createProcesses(vector< vector >, string, string, string, vector >); - int driver(vector, string, string, string, vector >, int); + int createProcesses(vector< vector >, string, string, string, vector >, int); + int driver(vector, string, string, string, vector >, int, string); bool getOligos(vector >&, string); string reverseOligo(string); vector getReads(bool ignoref, bool ignorer, fastqRead forward, fastqRead reverse, map& uniques); @@ -100,13 +101,13 @@ struct contigsData { string outputFasta; string outputScrapFasta; string outputMisMatches; - string align; + string align, group; vector files; vector > fastaFileNames; MothurOut* m; float match, misMatch, gapOpen, gapExtend; int count, insert, threadID, pdiffs, bdiffs, tdiffs, deltaq; - bool allFiles, createGroup, done, trimOverlap; + bool allFiles, createOligosGroup, createFileGroup, done, trimOverlap; map groupCounts; map groupMap; vector primerNameVector; @@ -115,7 +116,7 @@ struct contigsData { map primers; contigsData(){} - contigsData(vector f, string of, string osf, string om, string al, MothurOut* mout, float ma, float misMa, float gapO, float gapE, int thr, int delt, map br, map pr, vector > ffn, vectorbnv, vector pnv, int pdf, int bdf, int tdf, bool cg, bool all, bool to, int tid) { + contigsData(string g, vector f, string of, string osf, string om, string al, MothurOut* mout, float ma, float misMa, float gapO, float gapE, int thr, int delt, map br, map pr, vector > ffn, vectorbnv, vector pnv, int pdf, int bdf, int tdf, bool cg, bool cfg, bool all, bool to, int tid) { files = f; outputFasta = of; outputMisMatches = om; @@ -126,6 +127,7 @@ struct contigsData { gapExtend = gapE; insert = thr; align = al; + group = g; count = 0; outputScrapFasta = osf; fastaFileNames = ffn; @@ -138,7 +140,8 @@ struct contigsData { tdiffs = tdf; allFiles = all; trimOverlap = to; - createGroup = cg; + createOligosGroup = cg; + createFileGroup = cfg; threadID = tid; deltaq = delt; done=false; @@ -189,7 +192,7 @@ static DWORD WINAPI MyContigsThreadFunction(LPVOID lpParam){ pDataArray->m->openOutputFile(pDataArray->outputMisMatches, outMisMatch); pDataArray->m->openOutputFile(pDataArray->outputScrapFasta, outScrapFasta); - if (pDataArray->threadID == 0) { outMisMatch << "Name\tLength\tOverlap_Length\tOverlap_Start\tOverlap_End\tMisMatches\tNum_Ns\n"; } + outMisMatch << "Name\tLength\tOverlap_Length\tOverlap_Start\tOverlap_End\tMisMatches\tNum_Ns\n"; TrimOligos trimOligos(pDataArray->pdiffs, pDataArray->bdiffs, 0, 0, pDataArray->primers, pDataArray->barcodes); @@ -315,7 +318,7 @@ static DWORD WINAPI MyContigsThreadFunction(LPVOID lpParam){ if(trashCode.length() == 0){ bool ignore = false; - if (pDataArray->createGroup) { + if (pDataArray->createOligosGroup) { if(pDataArray->barcodes.size() != 0){ string thisGroup = pDataArray->barcodeNameVector[barcodeIndex]; if (pDataArray->primers.size() != 0) { @@ -339,7 +342,17 @@ static DWORD WINAPI MyContigsThreadFunction(LPVOID lpParam){ else { pDataArray->groupCounts[it->first] ++; } }else { ignore = true; } } + }else if (pDataArray->createFileGroup) { + int pos = pDataArray->group.find("ignore"); + if (pos == string::npos) { + pDataArray->groupMap[fSeq.getName()] = pDataArray->group; + + map::iterator it = pDataArray->groupCounts.find(pDataArray->group); + if (it == pDataArray->groupCounts.end()) { pDataArray->groupCounts[pDataArray->group] = 1; } + else { pDataArray->groupCounts[it->first]++; } + }else { ignore = true; } } + if(pDataArray->allFiles && !ignore){ ofstream output; diff --git a/mothurout.cpp b/mothurout.cpp index f7b0d86..3e4db45 100644 --- a/mothurout.cpp +++ b/mothurout.cpp @@ -1170,7 +1170,42 @@ int MothurOut::appendFiles(string temp, string filename) { exit(1); } } - +/**************************************************************************************************/ +int MothurOut::appendFilesWithoutHeaders(string temp, string filename) { + try{ + ofstream output; + ifstream input; + + //open output file in append mode + openOutputFileAppend(filename, output); + int ableToOpen = openInputFile(temp, input, "no error"); + //int ableToOpen = openInputFile(temp, input); + + int numLines = 0; + if (ableToOpen == 0) { //you opened it + + string headers = getline(input); gobble(input); + if (debug) { mothurOut("[DEBUG]: skipping headers " + headers +'\n'); } + + char buffer[4096]; + while (!input.eof()) { + input.read(buffer, 4096); + output.write(buffer, input.gcount()); + //count number of lines + for (int i = 0; i < input.gcount(); i++) { if (buffer[i] == '\n') {numLines++;} } + } + input.close(); + } + + output.close(); + + return numLines; + } + catch(exception& e) { + errorOut(e, "MothurOut", "appendFiles"); + exit(1); + } +} /**************************************************************************************************/ string MothurOut::sortFile(string distFile, string outputDir){ try { diff --git a/mothurout.h b/mothurout.h index be657f4..5ff0810 100644 --- a/mothurout.h +++ b/mothurout.h @@ -81,6 +81,7 @@ class MothurOut { vector setFilePosFasta(string, int&); string sortFile(string, string); int appendFiles(string, string); + int appendFilesWithoutHeaders(string, string); int renameFile(string, string); //oldname, newname string getFullPathName(string); string findProgramPath(string programName); diff --git a/seqerrorcommand.cpp b/seqerrorcommand.cpp index ecd6f0c..00897dc 100644 --- a/seqerrorcommand.cpp +++ b/seqerrorcommand.cpp @@ -23,7 +23,8 @@ vector SeqErrorCommand::setParameters(){ CommandParameter preference("reference", "InputTypes", "", "", "none", "none", "none","",false,true,true); parameters.push_back(preference); CommandParameter pqfile("qfile", "InputTypes", "", "", "none", "none", "QualReport","",false,false); parameters.push_back(pqfile); CommandParameter preport("report", "InputTypes", "", "", "none", "none", "QualReport","",false,false); parameters.push_back(preport); - CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none","",false,false); parameters.push_back(pname); + CommandParameter pname("name", "InputTypes", "", "", "namecount", "none", "none","",false,false,true); parameters.push_back(pname); + CommandParameter pcount("count", "InputTypes", "", "", "namecount", "none", "none","",false,false,true); parameters.push_back(pcount); CommandParameter pignorechimeras("ignorechimeras", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(pignorechimeras); CommandParameter pthreshold("threshold", "Number", "", "1.0", "", "", "","",false,false); parameters.push_back(pthreshold); CommandParameter paligned("aligned", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(paligned); @@ -52,7 +53,8 @@ string SeqErrorCommand::getHelpString(){ helpString += "The reference parameter...\n"; helpString += "The qfile parameter...\n"; helpString += "The report parameter...\n"; - helpString += "The name parameter...\n"; + helpString += "The name parameter allows you to provide a name file associated with the fasta file.\n"; + helpString += "The count parameter allows you to provide a count file associated with the fasta file.\n"; helpString += "The ignorechimeras parameter...\n"; helpString += "The threshold parameter...\n"; helpString += "The processors parameter...\n"; @@ -190,6 +192,14 @@ SeqErrorCommand::SeqErrorCommand(string option) { //if the user has not given a path then, add inputdir. else leave path alone. if (path == "") { parameters["name"] = inputDir + it->second; } } + + it = parameters.find("count"); + //user has given a names file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["count"] = inputDir + it->second; } + } it = parameters.find("qfile"); //user has given a quality score file @@ -227,6 +237,12 @@ SeqErrorCommand::SeqErrorCommand(string option) { if(namesFileName == "not found"){ namesFileName = ""; } else if (namesFileName == "not open") { namesFileName = ""; abort = true; } else { m->setNameFile(namesFileName); } + + //check for optional parameters + countfile = validParameter.validFile(parameters, "count", true); + if(countfile == "not found"){ countfile = ""; } + else if (countfile == "not open") { countfile = ""; abort = true; } + else { m->setCountTableFile(countfile); } qualFileName = validParameter.validFile(parameters, "qfile", true); if(qualFileName == "not found"){ qualFileName = ""; } @@ -243,6 +259,8 @@ SeqErrorCommand::SeqErrorCommand(string option) { outputDir += m->hasPath(queryFileName); //if user entered a file with a path then preserve it } + if ((countfile != "") && (namesFileName != "")) { m->mothurOut("You must enter ONLY ONE of the following: count or name."); m->mothurOutEndLine(); abort = true; } + //check for optional parameter and set defaults // ...at some point should added some additional type checking... temp = validParameter.validFile(parameters, "threshold", false); if (temp == "not found") { temp = "1.00"; } @@ -337,7 +355,12 @@ int SeqErrorCommand::execute(){ getReferences(); //read in reference sequences - make sure there's no ambiguous bases - if(namesFileName != ""){ weights = getWeights(); } + if(namesFileName != "") { weights = getWeights(); } + else if (countfile != "") { + CountTable ct; + ct.readTable(countfile); + weights = ct.getNameMap(); + } vector fastaFilePos; vector qFilePos; @@ -785,7 +808,7 @@ int SeqErrorCommand::driver(string filename, string qFileName, string rFileName, getErrors(query, reference, minCompare); - if(namesFileName != ""){ + if((namesFileName != "") || (countfile != "")){ it = weights.find(query.getName()); minCompare.weight = it->second; } diff --git a/seqerrorcommand.h b/seqerrorcommand.h index 9f400b3..aac17b6 100644 --- a/seqerrorcommand.h +++ b/seqerrorcommand.h @@ -13,6 +13,7 @@ #include "command.hpp" #include "sequence.hpp" #include "referencedb.h" +#include "counttable.h" class SeqErrorCommand : public Command { @@ -90,7 +91,7 @@ private: int driver(string, string, string, string, string, string, linePair, linePair, linePair); int createProcesses(string, string, string, string, string, string); - string queryFileName, referenceFileName, qualFileName, reportFileName, namesFileName, outputDir; + string queryFileName, referenceFileName, qualFileName, reportFileName, namesFileName, outputDir, countfile; double threshold; bool ignoreChimeras, save, aligned; int numRefs, processors; diff --git a/trimoligos.cpp b/trimoligos.cpp index 91bdb83..5d375a7 100644 --- a/trimoligos.cpp +++ b/trimoligos.cpp @@ -1,9 +1,9 @@ /* - * trimoligos.cpp - * Mothur + * trimoligos.cpp + * Mothur * - * Created by westcott on 9/1/11. - * Copyright 2011 Schloss Lab. All rights reserved. + * Created by westcott on 9/1/11. + * Copyright 2011 Schloss Lab. All rights reserved. * */ @@ -15,19 +15,18 @@ /********************************************************************/ //strip, pdiffs, bdiffs, primers, barcodes, revPrimers TrimOligos::TrimOligos(int p, int b, int l, int s, map pr, map br, vector r, vector lk, vector sp){ - try { - m = MothurOut::getInstance(); + try { + m = MothurOut::getInstance(); paired = false; - trashCode = ""; - - pdiffs = p; - bdiffs = b; + + pdiffs = p; + bdiffs = b; ldiffs = l; sdiffs = s; - - barcodes = br; - primers = pr; - revPrimer = r; + + barcodes = br; + primers = pr; + revPrimer = r; linker = lk; spacer = sp; maxFBarcodeLength = 0; @@ -37,7 +36,7 @@ TrimOligos::TrimOligos(int p, int b, int l, int s, map pr, map::iterator it; + map::iterator it; for(it=primers.begin();it!=primers.end();it++){ if(it->first.length() > maxFPrimerLength){ maxFPrimerLength = it->first.length(); @@ -57,44 +56,43 @@ TrimOligos::TrimOligos(int p, int b, int l, int s, map pr, maperrorOut(e, "TrimOligos", "TrimOligos"); - exit(1); - } + } + catch(exception& e) { + m->errorOut(e, "TrimOligos", "TrimOligos"); + exit(1); + } } /********************************************************************/ //strip, pdiffs, bdiffs, primers, barcodes, revPrimers TrimOligos::TrimOligos(int p, int b, int l, int s, map pr, map br){ - try { - m = MothurOut::getInstance(); - - pdiffs = p; - bdiffs = b; + try { + m = MothurOut::getInstance(); + + pdiffs = p; + bdiffs = b; ldiffs = l; sdiffs = s; paired = true; - trashCode = ""; - + maxFBarcodeLength = 0; for(map::iterator it=br.begin();it!=br.end();it++){ string forward = it->second.forward; map >::iterator itForward = ifbarcodes.find(forward); - if(forward.length() > maxFBarcodeLength){ maxFBarcodeLength = forward.length(); } + if(forward.length() > maxFBarcodeLength){ maxFBarcodeLength = forward.length(); } if (itForward == ifbarcodes.end()) { vector temp; temp.push_back(it->first); ifbarcodes[forward] = temp; }else { itForward->second.push_back(it->first); } } - + maxFPrimerLength = 0; for(map::iterator it=pr.begin();it!=pr.end();it++){ string forward = it->second.forward; map >::iterator itForward = ifprimers.find(forward); - if(forward.length() > maxFPrimerLength){ maxFPrimerLength = forward.length(); } + if(forward.length() > maxFPrimerLength){ maxFPrimerLength = forward.length(); } if (itForward == ifprimers.end()) { vector temp; temp.push_back(it->first); @@ -107,7 +105,7 @@ TrimOligos::TrimOligos(int p, int b, int l, int s, map pr, map< string forward = it->second.reverse; map >::iterator itForward = irbarcodes.find(forward); - if(forward.length() > maxRBarcodeLength){ maxRBarcodeLength = forward.length(); } + if(forward.length() > maxRBarcodeLength){ maxRBarcodeLength = forward.length(); } if (itForward == irbarcodes.end()) { vector temp; temp.push_back(it->first); @@ -120,36 +118,35 @@ TrimOligos::TrimOligos(int p, int b, int l, int s, map pr, map< string forward = it->second.reverse; map >::iterator itForward = irprimers.find(forward); - if(forward.length() > maxRPrimerLength){ maxRPrimerLength = forward.length(); } + if(forward.length() > maxRPrimerLength){ maxRPrimerLength = forward.length(); } if (itForward == irprimers.end()) { vector temp; temp.push_back(it->first); irprimers[forward] = temp; }else { itForward->second.push_back(it->first); } } - + ipbarcodes = br; ipprimers = pr; - } - catch(exception& e) { - m->errorOut(e, "TrimOligos", "TrimOligos"); - exit(1); - } + } + catch(exception& e) { + m->errorOut(e, "TrimOligos", "TrimOligos"); + exit(1); + } } /********************************************************************/ //strip, pdiffs, bdiffs, primers, barcodes, revPrimers TrimOligos::TrimOligos(int p, int b, map pr, map br, vector r){ - try { - m = MothurOut::getInstance(); - - pdiffs = p; - bdiffs = b; - - barcodes = br; - primers = pr; - revPrimer = r; + try { + m = MothurOut::getInstance(); + + pdiffs = p; + bdiffs = b; + + barcodes = br; + primers = pr; + revPrimer = r; paired = false; - trashCode = ""; maxFBarcodeLength = 0; for(map::iterator it=barcodes.begin();it!=barcodes.end();it++){ @@ -168,30 +165,29 @@ TrimOligos::TrimOligos(int p, int b, map pr, map br, v } } maxRPrimerLength = maxFPrimerLength; - } - catch(exception& e) { - m->errorOut(e, "TrimOligos", "TrimOligos"); - exit(1); - } + } + catch(exception& e) { + m->errorOut(e, "TrimOligos", "TrimOligos"); + exit(1); + } } /********************************************************************/ TrimOligos::~TrimOligos() {} //********************************************************************/ bool TrimOligos::findForward(Sequence& seq, int& primerStart, int& primerEnd){ - try { - - string rawSequence = seq.getUnaligned(); - trashCode = ""; - - for(map::iterator it=primers.begin();it!=primers.end();it++){ - string oligo = it->first; - - if(rawSequence.length() < oligo.length()) { break; } - - //search for primer + try { + + string rawSequence = seq.getUnaligned(); + + for(map::iterator it=primers.begin();it!=primers.end();it++){ + string oligo = it->first; + + if(rawSequence.length() < oligo.length()) { break; } + + //search for primer int olength = oligo.length(); for (int j = 0; j < rawSequence.length()-olength; j++){ - if (m->control_pressed) { primerStart = 0; primerEnd = 0; return false; } + if (m->control_pressed) { primerStart = 0; primerEnd = 0; return false; } string rawChunk = rawSequence.substr(j, olength); if(compareDNASeq(oligo, rawChunk)) { primerStart = j; @@ -200,22 +196,22 @@ bool TrimOligos::findForward(Sequence& seq, int& primerStart, int& primerEnd){ } } - } - + } + primerStart = 0; primerEnd = 0; //if you don't want to allow for diffs - if (pdiffs == 0) { trashCode = "f"; return false; } - else { //try aligning and see if you can find it - - //can you find the barcode - int minDiff = 1e6; - int minCount = 1; - + if (pdiffs == 0) { return false; } + else { //try aligning and see if you can find it + + //can you find the barcode + int minDiff = 1e6; + int minCount = 1; + Alignment* alignment; - if (primers.size() > 0) { alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxFPrimerLength+pdiffs+1)); } - else{ alignment = NULL; } + if (primers.size() > 0) { alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxFPrimerLength+pdiffs+1)); } + else{ alignment = NULL; } - for(map::iterator it=primers.begin();it!=primers.end();it++){ + for(map::iterator it=primers.begin();it!=primers.end();it++){ string prim = it->first; //search for primer @@ -225,26 +221,26 @@ bool TrimOligos::findForward(Sequence& seq, int& primerStart, int& primerEnd){ for (int j = 0; j < rawSequence.length()-olength; j++){ string oligo = it->first; - - if (m->control_pressed) { primerStart = 0; primerEnd = 0; return false; } - + + if (m->control_pressed) { primerStart = 0; primerEnd = 0; return false; } + string rawChunk = rawSequence.substr(j, olength+pdiffs); //use needleman to align first primer.length()+numdiffs of sequence to each barcode alignment->alignPrimer(oligo, rawChunk); oligo = alignment->getSeqAAln(); string temp = alignment->getSeqBAln(); - + int alnLength = oligo.length(); - + for(int i=oligo.length()-1;i>=0;i--){ if(oligo[i] != '-'){ alnLength = i+1; break; } } oligo = oligo.substr(0,alnLength); temp = temp.substr(0,alnLength); - + int numDiff = countDiffs(oligo, temp); - + if(numDiff < minDiff){ minDiff = numDiff; minCount = 1; @@ -253,39 +249,38 @@ bool TrimOligos::findForward(Sequence& seq, int& primerStart, int& primerEnd){ }else if(numDiff == minDiff){ minCount++; } } } - } - - if (alignment != NULL) { delete alignment; } + } + + if (alignment != NULL) { delete alignment; } - if(minDiff > pdiffs) { primerStart = 0; primerEnd = 0; trashCode = "f"; return false; } //no good matches - else if(minCount > 1) { primerStart = 0; primerEnd = 0; trashCode = "f"; return false; } //can't tell the difference between multiple primers - else{ trashCode = ""; return true; } - } + if(minDiff > pdiffs) { primerStart = 0; primerEnd = 0; return false; } //no good matches + else if(minCount > 1) { primerStart = 0; primerEnd = 0; return false; } //can't tell the difference between multiple primers + else{ return true; } + } - trashCode = "f"; primerStart = 0; primerEnd = 0; - return false; - - } - catch(exception& e) { - m->errorOut(e, "TrimOligos", "stripForward"); - exit(1); - } + return false; + + } + catch(exception& e) { + m->errorOut(e, "TrimOligos", "stripForward"); + exit(1); + } } //******************************************************************/ bool TrimOligos::findReverse(Sequence& seq, int& primerStart, int& primerEnd){ - try { - trashCode = ""; - string rawSequence = seq.getUnaligned(); - - for(int i=0;i= 0; j--){ - if (m->control_pressed) { primerStart = 0; primerEnd = 0; return false; } + if (m->control_pressed) { primerStart = 0; primerEnd = 0; return false; } string rawChunk = rawSequence.substr(j, olength); if(compareDNASeq(oligo, rawChunk)) { @@ -295,253 +290,239 @@ bool TrimOligos::findReverse(Sequence& seq, int& primerStart, int& primerEnd){ } } - } - - trashCode = "r"; + } + primerStart = 0; primerEnd = 0; - return false; - } - catch(exception& e) { - m->errorOut(e, "PcrSeqsCommand", "findReverse"); - exit(1); - } + return false; + } + catch(exception& e) { + m->errorOut(e, "PcrSeqsCommand", "findReverse"); + exit(1); + } } //*******************************************************************/ int TrimOligos::stripBarcode(Sequence& seq, QualityScores& qual, int& group){ - try { - trashCode = ""; - if (paired) { int success = stripPairedBarcode(seq, qual, group); return success; } - - string rawSequence = seq.getUnaligned(); - int success = bdiffs + 1; //guilty until proven innocent - - //can you find the barcode - for(map::iterator it=barcodes.begin();it!=barcodes.end();it++){ - string oligo = it->first; - if(rawSequence.length() < oligo.length()){ //let's just assume that the barcodes are the same length - success = bdiffs + 10; //if the sequence is shorter than the barcode then bail out - break; - } - - if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){ - group = it->second; - seq.setUnaligned(rawSequence.substr(oligo.length())); - - if(qual.getName() != ""){ - qual.trimQScores(oligo.length(), -1); - } - - success = 0; - trashCode = ""; - return success; - } - } - - //if you found the barcode or if you don't want to allow for diffs - if (bdiffs == 0) { trashCode = "b"; return success; } - - else { //try aligning and see if you can find it - Alignment* alignment; - if (barcodes.size() > 0) {alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxFBarcodeLength+bdiffs+1)); } - else{ alignment = NULL; } - - //can you find the barcode - int minDiff = 1e6; - int minCount = 1; - int minGroup = -1; - int minPos = 0; - - for(map::iterator it=barcodes.begin();it!=barcodes.end();it++){ - string oligo = it->first; - // int length = oligo.length(); - - if(rawSequence.length() < maxFBarcodeLength){ //let's just assume that the barcodes are the same length - success = bdiffs + 10; trashCode = "b"; - break; - } - - //use needleman to align first barcode.length()+numdiffs of sequence to each barcode - alignment->alignPrimer(oligo, rawSequence.substr(0,oligo.length()+bdiffs)); - oligo = alignment->getSeqAAln(); - string temp = alignment->getSeqBAln(); - - int alnLength = oligo.length(); - - for(int i=oligo.length()-1;i>=0;i--){ - if(oligo[i] != '-'){ alnLength = i+1; break; } - } - oligo = oligo.substr(0,alnLength); - temp = temp.substr(0,alnLength); + try { + + if (paired) { int success = stripPairedBarcode(seq, qual, group); return success; } + + string rawSequence = seq.getUnaligned(); + int success = bdiffs + 1; //guilty until proven innocent + + //can you find the barcode + for(map::iterator it=barcodes.begin();it!=barcodes.end();it++){ + string oligo = it->first; + if(rawSequence.length() < oligo.length()){ //let's just assume that the barcodes are the same length + success = bdiffs + 10; //if the sequence is shorter than the barcode then bail out + break; + } + + if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){ + group = it->second; + seq.setUnaligned(rawSequence.substr(oligo.length())); + + if(qual.getName() != ""){ + qual.trimQScores(oligo.length(), -1); + } + + success = 0; + break; + } + } + + //if you found the barcode or if you don't want to allow for diffs + if ((bdiffs == 0) || (success == 0)) { return success; } + + else { //try aligning and see if you can find it + Alignment* alignment; + if (barcodes.size() > 0) {alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxFBarcodeLength+bdiffs+1)); } + else{ alignment = NULL; } + + //can you find the barcode + int minDiff = 1e6; + int minCount = 1; + int minGroup = -1; + int minPos = 0; + + for(map::iterator it=barcodes.begin();it!=barcodes.end();it++){ + string oligo = it->first; + // int length = oligo.length(); + + if(rawSequence.length() < maxFBarcodeLength){ //let's just assume that the barcodes are the same length + success = bdiffs + 10; + break; + } + + //use needleman to align first barcode.length()+numdiffs of sequence to each barcode + alignment->alignPrimer(oligo, rawSequence.substr(0,oligo.length()+bdiffs)); + oligo = alignment->getSeqAAln(); + string temp = alignment->getSeqBAln(); + + int alnLength = oligo.length(); + + for(int i=oligo.length()-1;i>=0;i--){ + if(oligo[i] != '-'){ alnLength = i+1; break; } + } + oligo = oligo.substr(0,alnLength); + temp = temp.substr(0,alnLength); int numDiff = countDiffs(oligo, temp); - - if(numDiff < minDiff){ - minDiff = numDiff; - minCount = 1; - minGroup = it->second; - minPos = 0; - for(int i=0;i bdiffs) { trashCode = "b"; success = minDiff; } //no good matches - else if(minCount > 1) { trashCode = "b"; success = bdiffs + 100; } //can't tell the difference between multiple barcodes - else{ //use the best match - group = minGroup; - seq.setUnaligned(rawSequence.substr(minPos)); - - if(qual.getName() != ""){ - qual.trimQScores(minPos, -1); - } - success = minDiff; - trashCode = ""; - } - - if (alignment != NULL) { delete alignment; } - - } - - return success; - - } - catch(exception& e) { - m->errorOut(e, "TrimOligos", "stripBarcode"); - exit(1); - } + + if(numDiff < minDiff){ + minDiff = numDiff; + minCount = 1; + minGroup = it->second; + minPos = 0; + for(int i=0;i bdiffs) { success = minDiff; } //no good matches + else if(minCount > 1) { success = bdiffs + 100; } //can't tell the difference between multiple barcodes + else{ //use the best match + group = minGroup; + seq.setUnaligned(rawSequence.substr(minPos)); + + if(qual.getName() != ""){ + qual.trimQScores(minPos, -1); + } + success = minDiff; + } + + if (alignment != NULL) { delete alignment; } + + } + + return success; + + } + catch(exception& e) { + m->errorOut(e, "TrimOligos", "stripBarcode"); + exit(1); + } } //*******************************************************************/ int TrimOligos::stripBarcode(Sequence& forwardSeq, Sequence& reverseSeq, int& group){ - try { - trashCode = ""; - //look for forward barcode - string rawFSequence = forwardSeq.getUnaligned(); + try { + //look for forward barcode + string rawFSequence = forwardSeq.getUnaligned(); string rawRSequence = reverseSeq.getUnaligned(); - int success = bdiffs + 1; //guilty until proven innocent - - //can you find the forward barcode - for(map::iterator it=ipbarcodes.begin();it!=ipbarcodes.end();it++){ - string foligo = it->second.forward; + int success = bdiffs + 1; //guilty until proven innocent + + //can you find the forward barcode + for(map::iterator it=ipbarcodes.begin();it!=ipbarcodes.end();it++){ + string foligo = it->second.forward; string roligo = it->second.reverse; - if(rawFSequence.length() < foligo.length()){ //let's just assume that the barcodes are the same length - success = bdiffs + 10; //if the sequence is shorter than the barcode then bail out - trashCode += "b"; + if(rawFSequence.length() < foligo.length()){ //let's just assume that the barcodes are the same length + success = bdiffs + 10; //if the sequence is shorter than the barcode then bail out break; - } + } if(rawRSequence.length() < roligo.length()){ //let's just assume that the barcodes are the same length - success = bdiffs + 10; //if the sequence is shorter than the barcode then bail out - trashCode += "d"; - break; - } - - if(compareDNASeq(foligo, rawFSequence.substr(0,foligo.length()))) { - if (compareDNASeq(roligo, rawRSequence.substr((rawRSequence.length()-roligo.length()),roligo.length()))) { - group = it->first; - forwardSeq.setUnaligned(rawFSequence.substr(foligo.length())); - reverseSeq.setUnaligned(rawRSequence.substr(0,(rawRSequence.length()-roligo.length()))); - success = 0; trashCode = ""; - return success; - } - }else { - trashCode = "b"; - if (!compareDNASeq(roligo, rawRSequence.substr((rawRSequence.length()-roligo.length()),roligo.length()))) { trashCode += "d"; } - } - } - - //if you found the barcode or if you don't want to allow for diffs - if (bdiffs == 0) { return success; } - else { //try aligning and see if you can find it - trashCode = ""; - Alignment* alignment; - if (ifbarcodes.size() > 0) { alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxFBarcodeLength+bdiffs+1)); } - else{ alignment = NULL; } - - //can you find the barcode - int minDiff = 1e6; - int minCount = 1; - vector< vector > minFGroup; - vector minFPos; + success = bdiffs + 10; //if the sequence is shorter than the barcode then bail out + break; + } + + if((compareDNASeq(foligo, rawFSequence.substr(0,foligo.length()))) && (compareDNASeq(roligo, rawRSequence.substr((rawRSequence.length()-roligo.length()),roligo.length())))) { + group = it->first; + forwardSeq.setUnaligned(rawFSequence.substr(foligo.length())); + reverseSeq.setUnaligned(rawRSequence.substr(0,(rawRSequence.length()-roligo.length()))); + success = 0; + break; + } + } + + //if you found the barcode or if you don't want to allow for diffs + if ((bdiffs == 0) || (success == 0)) { return success; } + else { //try aligning and see if you can find it + Alignment* alignment; + if (ifbarcodes.size() > 0) { alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxFBarcodeLength+bdiffs+1)); } + else{ alignment = NULL; } + + //can you find the barcode + int minDiff = 1e6; + int minCount = 1; + vector< vector > minFGroup; + vector minFPos; //the pair can have duplicates, but we only want to search for the unique forward and reverses, example /* - 1 Sarah Westcott - 2 John Westcott - 3 Anna Westcott - 4 Sarah Schloss - 5 Pat Schloss - 6 Gail Brown - 7 Pat Moore - - only want to look for forward = Sarah, John, Anna, Pat, Gail - reverse = Westcott, Schloss, Brown, Moore - - but if best match forward = 4, and reverse = 1, we want to count as a valid match because forward 1 and forward 4 are the same. so both barcodes map to same group. + 1 Sarah Westcott + 2 John Westcott + 3 Anna Westcott + 4 Sarah Schloss + 5 Pat Schloss + 6 Gail Brown + 7 Pat Moore + only want to look for forward = Sarah, John, Anna, Pat, Gail + reverse = Westcott, Schloss, Brown, Moore + but if best match forward = 4, and reverse = 1, we want to count as a valid match because forward 1 and forward 4 are the same. so both barcodes map to same group. */ - //cout << endl << forwardSeq.getName() << endl; - for(map >::iterator it=ifbarcodes.begin();it!=ifbarcodes.end();it++){ - string oligo = it->first; - - if(rawFSequence.length() < maxFBarcodeLength){ //let's just assume that the barcodes are the same length - success = bdiffs + 10; - break; - } - //cout << "before = " << oligo << '\t' << rawFSequence.substr(0,oligo.length()+bdiffs) << endl; - //use needleman to align first barcode.length()+numdiffs of sequence to each barcode - alignment->alignPrimer(oligo, rawFSequence.substr(0,oligo.length()+bdiffs)); - oligo = alignment->getSeqAAln(); - string temp = alignment->getSeqBAln(); - - int alnLength = oligo.length(); - - for(int i=oligo.length()-1;i>=0;i--){ if(oligo[i] != '-'){ alnLength = i+1; break; } } - oligo = oligo.substr(0,alnLength); - temp = temp.substr(0,alnLength); + //cout << endl << forwardSeq.getName() << endl; + for(map >::iterator it=ifbarcodes.begin();it!=ifbarcodes.end();it++){ + string oligo = it->first; + + if(rawFSequence.length() < maxFBarcodeLength){ //let's just assume that the barcodes are the same length + success = bdiffs + 10; + break; + } + //cout << "before = " << oligo << '\t' << rawFSequence.substr(0,oligo.length()+bdiffs) << endl; + //use needleman to align first barcode.length()+numdiffs of sequence to each barcode + alignment->alignPrimer(oligo, rawFSequence.substr(0,oligo.length()+bdiffs)); + oligo = alignment->getSeqAAln(); + string temp = alignment->getSeqBAln(); + + int alnLength = oligo.length(); + + for(int i=oligo.length()-1;i>=0;i--){ if(oligo[i] != '-'){ alnLength = i+1; break; } } + oligo = oligo.substr(0,alnLength); + temp = temp.substr(0,alnLength); int numDiff = countDiffs(oligo, temp); - + if (alnLength == 0) { numDiff = bdiffs + 100; } //cout << "after = " << oligo << '\t' << temp << '\t' << numDiff << endl; - if(numDiff < minDiff){ - minDiff = numDiff; - minCount = 1; - minFGroup.clear(); + if(numDiff < minDiff){ + minDiff = numDiff; + minCount = 1; + minFGroup.clear(); minFGroup.push_back(it->second); - int tempminFPos = 0; + int tempminFPos = 0; minFPos.clear(); - for(int i=0;isecond); + }else if(numDiff == minDiff){ + minFGroup.push_back(it->second); int tempminFPos = 0; - for(int i=0;i bdiffs) { trashCode = "b"; minFGroup.clear(); success = minDiff; } //no good matches - //else{ + //cout << minDiff << '\t' << minCount << '\t' << endl; + if(minDiff > bdiffs) { success = minDiff; } //no good matches + else{ //check for reverse match - if (alignment != NULL) { delete alignment; } + if (alignment != NULL) { delete alignment; } - if (irbarcodes.size() > 0) { alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxRBarcodeLength+bdiffs+1)); } - else{ alignment = NULL; } + if (irbarcodes.size() > 0) { alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxRBarcodeLength+bdiffs+1)); } + else{ alignment = NULL; } //can you find the barcode minDiff = 1e6; @@ -582,7 +563,7 @@ int TrimOligos::stripBarcode(Sequence& forwardSeq, Sequence& reverseSeq, int& gr tempminRPos++; } } - minRPos.push_back(tempminRPos); + minRPos.push_back(tempminRPos); }else if(numDiff == minDiff){ int tempminRPos = 0; for(int i=0;isecond); } } /*cout << minDiff << '\t' << minCount << '\t' << endl; - for (int i = 0; i < minFGroup.size(); i++) { - cout << i << '\t'; - for (int j = 0; j < minFGroup[i].size(); j++) { cout << minFGroup[i][j] << " "; } - cout << endl; - } - cout << endl; - for (int i = 0; i < minRGroup.size(); i++) { - cout << i << '\t'; - for (int j = 0; j < minRGroup[i].size(); j++) { cout << minRGroup[i][j] << " "; } - cout << endl; - } - cout << endl;*/ - if(minDiff > bdiffs) { trashCode += "d"; success = minDiff; } //no good matches - else { + for (int i = 0; i < minFGroup.size(); i++) { + cout << i << '\t'; + for (int j = 0; j < minFGroup[i].size(); j++) { cout << minFGroup[i][j] << " "; } + cout << endl; + } + cout << endl; + for (int i = 0; i < minRGroup.size(); i++) { + cout << i << '\t'; + for (int j = 0; j < minRGroup[i].size(); j++) { cout << minRGroup[i][j] << " "; } + cout << endl; + } + cout << endl;*/ + if(minDiff > bdiffs) { success = minDiff; } //no good matches + else { bool foundMatch = false; vector matches; for (int i = 0; i < minFGroup.size(); i++) { @@ -623,9 +604,9 @@ int TrimOligos::stripBarcode(Sequence& forwardSeq, Sequence& reverseSeq, int& gr int fStart = 0; int rStart = 0; - if (matches.size() == 1) { + if (matches.size() == 1) { foundMatch = true; - group = matches[0]; + group = matches[0]; fStart = minFPos[0]; rStart = minRPos[0]; } @@ -635,154 +616,138 @@ int TrimOligos::stripBarcode(Sequence& forwardSeq, Sequence& reverseSeq, int& gr forwardSeq.setUnaligned(rawFSequence.substr(fStart)); reverseSeq.setUnaligned(rawRSequence.substr(rStart)); success = minDiff; - }else { if (trashCode == "") { trashCode = "bd"; } success = bdiffs + 100; } - } - //} - - if (alignment != NULL) { delete alignment; } - } - - //catchall for case I didn't think of - if ((trashCode == "") && (success > bdiffs)) { trashCode = "bd"; } - - return success; - - } - catch(exception& e) { - m->errorOut(e, "TrimOligos", "stripIBarcode"); - exit(1); - } - + }else { success = bdiffs + 100; } + } + } + + if (alignment != NULL) { delete alignment; } + } + + return success; + + } + catch(exception& e) { + m->errorOut(e, "TrimOligos", "stripIBarcode"); + exit(1); + } + } //*******************************************************************/ int TrimOligos::stripBarcode(Sequence& forwardSeq, Sequence& reverseSeq, QualityScores& forwardQual, QualityScores& reverseQual, int& group){ - try { - trashCode = ""; - - //look for forward barcode - string rawFSequence = forwardSeq.getUnaligned(); + try { + //look for forward barcode + string rawFSequence = forwardSeq.getUnaligned(); string rawRSequence = reverseSeq.getUnaligned(); - int success = bdiffs + 1; //guilty until proven innocent - - //can you find the forward barcode - for(map::iterator it=ipbarcodes.begin();it!=ipbarcodes.end();it++){ - string foligo = it->second.forward; + int success = bdiffs + 1; //guilty until proven innocent + + //can you find the forward barcode + for(map::iterator it=ipbarcodes.begin();it!=ipbarcodes.end();it++){ + string foligo = it->second.forward; string roligo = it->second.reverse; - if(rawFSequence.length() < foligo.length()){ //let's just assume that the barcodes are the same length - success = bdiffs + 10; //if the sequence is shorter than the barcode then bail out - trashCode = "b"; - break; - } + if(rawFSequence.length() < foligo.length()){ //let's just assume that the barcodes are the same length + success = bdiffs + 10; //if the sequence is shorter than the barcode then bail out + break; + } if(rawRSequence.length() < roligo.length()){ //let's just assume that the barcodes are the same length - success = bdiffs + 10; //if the sequence is shorter than the barcode then bail out - trashCode += "d"; - break; - } - - if(compareDNASeq(foligo, rawFSequence.substr(0,foligo.length()))) { - if (compareDNASeq(roligo, rawRSequence.substr((rawRSequence.length()-roligo.length()),roligo.length()))) { - group = it->first; - forwardSeq.setUnaligned(rawFSequence.substr(foligo.length())); - reverseSeq.setUnaligned(rawRSequence.substr(0,(rawRSequence.length()-roligo.length()))); - forwardQual.trimQScores(foligo.length(), -1); - reverseQual.trimQScores(-1, rawRSequence.length()-roligo.length()); - success = 0; trashCode = ""; - return success; - } - }else { - trashCode = "b"; - if (!compareDNASeq(roligo, rawRSequence.substr((rawRSequence.length()-roligo.length()),roligo.length()))) { trashCode += "d"; - } - } - } - - //if you found the barcode or if you don't want to allow for diffs - if (bdiffs == 0) { return success; } - else { //try aligning and see if you can find it - trashCode = ""; - Alignment* alignment; - if (ifbarcodes.size() > 0) { alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxFBarcodeLength+bdiffs+1)); } - else{ alignment = NULL; } - - //can you find the barcode - int minDiff = 1e6; - int minCount = 1; - vector< vector > minFGroup; - vector minFPos; + success = bdiffs + 10; //if the sequence is shorter than the barcode then bail out + break; + } + + if((compareDNASeq(foligo, rawFSequence.substr(0,foligo.length()))) && (compareDNASeq(roligo, rawRSequence.substr((rawRSequence.length()-roligo.length()),roligo.length())))) { + group = it->first; + forwardSeq.setUnaligned(rawFSequence.substr(foligo.length())); + reverseSeq.setUnaligned(rawRSequence.substr(0,(rawRSequence.length()-roligo.length()))); + forwardQual.trimQScores(foligo.length(), -1); + reverseQual.trimQScores(-1, rawRSequence.length()-roligo.length()); + success = 0; + break; + } + } + + //if you found the barcode or if you don't want to allow for diffs + if ((bdiffs == 0) || (success == 0)) { return success; } + else { //try aligning and see if you can find it + Alignment* alignment; + if (ifbarcodes.size() > 0) { alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxFBarcodeLength+bdiffs+1)); } + else{ alignment = NULL; } + + //can you find the barcode + int minDiff = 1e6; + int minCount = 1; + vector< vector > minFGroup; + vector minFPos; //the pair can have duplicates, but we only want to search for the unique forward and reverses, example /* - 1 Sarah Westcott - 2 John Westcott - 3 Anna Westcott - 4 Sarah Schloss - 5 Pat Schloss - 6 Gail Brown - 7 Pat Moore - + 1 Sarah Westcott + 2 John Westcott + 3 Anna Westcott + 4 Sarah Schloss + 5 Pat Schloss + 6 Gail Brown + 7 Pat Moore only want to look for forward = Sarah, John, Anna, Pat, Gail reverse = Westcott, Schloss, Brown, Moore - - but if best match forward = 4, and reverse = 1, we want to count as a valid match because forward 1 and forward 4 are the same. so both barcodes map to same group. + but if best match forward = 4, and reverse = 1, we want to count as a valid match because forward 1 and forward 4 are the same. so both barcodes map to same group. */ - //cout << endl << forwardSeq.getName() << endl; - for(map >::iterator it=ifbarcodes.begin();it!=ifbarcodes.end();it++){ - string oligo = it->first; - - if(rawFSequence.length() < maxFBarcodeLength){ //let's just assume that the barcodes are the same length - success = bdiffs + 10; - break; - } - //cout << "before = " << oligo << '\t' << rawFSequence.substr(0,oligo.length()+bdiffs) << endl; - //use needleman to align first barcode.length()+numdiffs of sequence to each barcode - alignment->alignPrimer(oligo, rawFSequence.substr(0,oligo.length()+bdiffs)); - oligo = alignment->getSeqAAln(); - string temp = alignment->getSeqBAln(); - - int alnLength = oligo.length(); - - for(int i=oligo.length()-1;i>=0;i--){ if(oligo[i] != '-'){ alnLength = i+1; break; } } - oligo = oligo.substr(0,alnLength); - temp = temp.substr(0,alnLength); + //cout << endl << forwardSeq.getName() << endl; + for(map >::iterator it=ifbarcodes.begin();it!=ifbarcodes.end();it++){ + string oligo = it->first; + + if(rawFSequence.length() < maxFBarcodeLength){ //let's just assume that the barcodes are the same length + success = bdiffs + 10; + break; + } + //cout << "before = " << oligo << '\t' << rawFSequence.substr(0,oligo.length()+bdiffs) << endl; + //use needleman to align first barcode.length()+numdiffs of sequence to each barcode + alignment->alignPrimer(oligo, rawFSequence.substr(0,oligo.length()+bdiffs)); + oligo = alignment->getSeqAAln(); + string temp = alignment->getSeqBAln(); + + int alnLength = oligo.length(); + + for(int i=oligo.length()-1;i>=0;i--){ if(oligo[i] != '-'){ alnLength = i+1; break; } } + oligo = oligo.substr(0,alnLength); + temp = temp.substr(0,alnLength); int numDiff = countDiffs(oligo, temp); - + if (alnLength == 0) { numDiff = bdiffs + 100; } //cout << "after = " << oligo << '\t' << temp << '\t' << numDiff << endl; - if(numDiff < minDiff){ - minDiff = numDiff; - minCount = 1; - minFGroup.clear(); + if(numDiff < minDiff){ + minDiff = numDiff; + minCount = 1; + minFGroup.clear(); minFGroup.push_back(it->second); - int tempminFPos = 0; + int tempminFPos = 0; minFPos.clear(); - for(int i=0;isecond); + }else if(numDiff == minDiff){ + minFGroup.push_back(it->second); int tempminFPos = 0; - for(int i=0;i bdiffs) { trashCode = "b"; minFGroup.clear(); success = minDiff; } //no good matches - //else{ + //cout << minDiff << '\t' << minCount << '\t' << endl; + if(minDiff > bdiffs) { success = minDiff; } //no good matches + else{ //check for reverse match - if (alignment != NULL) { delete alignment; } + if (alignment != NULL) { delete alignment; } - if (irbarcodes.size() > 0) { alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxRBarcodeLength+bdiffs+1)); } - else{ alignment = NULL; } + if (irbarcodes.size() > 0) { alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxRBarcodeLength+bdiffs+1)); } + else{ alignment = NULL; } //can you find the barcode minDiff = 1e6; @@ -823,7 +788,7 @@ int TrimOligos::stripBarcode(Sequence& forwardSeq, Sequence& reverseSeq, Quality tempminRPos++; } } - minRPos.push_back(tempminRPos); + minRPos.push_back(tempminRPos); }else if(numDiff == minDiff){ int tempminRPos = 0; for(int i=0;isecond); } } /*cout << minDiff << '\t' << minCount << '\t' << endl; - for (int i = 0; i < minFGroup.size(); i++) { + for (int i = 0; i < minFGroup.size(); i++) { cout << i << '\t'; - for (int j = 0; j < minFGroup[i].size(); j++) { cout << minFGroup[i][j] << " "; } + for (int j = 0; j < minFGroup[i].size(); j++) { cout << minFGroup[i][j] << " "; } cout << endl; } cout << endl; - for (int i = 0; i < minRGroup.size(); i++) { + for (int i = 0; i < minRGroup.size(); i++) { cout << i << '\t'; - for (int j = 0; j < minRGroup[i].size(); j++) { cout << minRGroup[i][j] << " "; } + for (int j = 0; j < minRGroup[i].size(); j++) { cout << minRGroup[i][j] << " "; } cout << endl; } cout << endl;*/ - if(minDiff > bdiffs) { trashCode = "d"; success = minDiff; } //no good matches - else { + if(minDiff > bdiffs) { success = minDiff; } //no good matches + else { bool foundMatch = false; vector matches; for (int i = 0; i < minFGroup.size(); i++) { @@ -864,9 +829,9 @@ int TrimOligos::stripBarcode(Sequence& forwardSeq, Sequence& reverseSeq, Quality int fStart = 0; int rStart = 0; - if (matches.size() == 1) { + if (matches.size() == 1) { foundMatch = true; - group = matches[0]; + group = matches[0]; fStart = minFPos[0]; rStart = minRPos[0]; } @@ -878,154 +843,139 @@ int TrimOligos::stripBarcode(Sequence& forwardSeq, Sequence& reverseSeq, Quality forwardQual.trimQScores(fStart, -1); reverseQual.trimQScores(rStart, -1); success = minDiff; - }else { if (trashCode == "") { trashCode = "bd"; } success = bdiffs + 100; } - } - //} - - if (alignment != NULL) { delete alignment; } - } - - //catchall for case I didn't think of - if ((trashCode == "") && (success > bdiffs)) { trashCode = "bd"; } - - return success; - - } - catch(exception& e) { - m->errorOut(e, "TrimOligos", "stripIBarcode"); - exit(1); - } - + }else { success = bdiffs + 100; } + } + } + + if (alignment != NULL) { delete alignment; } + } + + return success; + + } + catch(exception& e) { + m->errorOut(e, "TrimOligos", "stripIBarcode"); + exit(1); + } + } //*******************************************************************/ int TrimOligos::stripPairedBarcode(Sequence& seq, QualityScores& qual, int& group){ - try { - trashCode = ""; - - //look for forward barcode - string rawSeq = seq.getUnaligned(); - int success = bdiffs + 1; //guilty until proven innocent - //cout << seq.getName() << endl; - //can you find the forward barcode - for(map::iterator it=ipbarcodes.begin();it!=ipbarcodes.end();it++){ - string foligo = it->second.forward; + try { + //look for forward barcode + string rawSeq = seq.getUnaligned(); + int success = bdiffs + 1; //guilty until proven innocent + //cout << seq.getName() << endl; + //can you find the forward barcode + for(map::iterator it=ipbarcodes.begin();it!=ipbarcodes.end();it++){ + string foligo = it->second.forward; string roligo = it->second.reverse; //cout << it->first << '\t' << foligo << '\t' << roligo << endl; //cout << it->first << '\t' << rawSeq.substr(0,foligo.length()) << '\t' << rawSeq.substr(rawSeq.length()-roligo.length(),roligo.length()) << endl; - if(rawSeq.length() < foligo.length()){ //let's just assume that the barcodes are the same length - success = bdiffs + 10; //if the sequence is shorter than the barcode then bail out - trashCode = "b"; - break; - } + if(rawSeq.length() < foligo.length()){ //let's just assume that the barcodes are the same length + success = bdiffs + 10; //if the sequence is shorter than the barcode then bail out + break; + } if(rawSeq.length() < roligo.length()){ //let's just assume that the barcodes are the same length - success = bdiffs + 10; //if the sequence is shorter than the barcode then bail out - trashCode += "d"; - break; - } - - if(compareDNASeq(foligo, rawSeq.substr(0,foligo.length()))) { //find forward - if (compareDNASeq(roligo, rawSeq.substr(rawSeq.length()-roligo.length(),roligo.length()))) { //find reverse - group = it->first; - string trimmedSeq = rawSeq.substr(foligo.length()); //trim forward barcode - seq.setUnaligned(trimmedSeq.substr(0,(trimmedSeq.length()-roligo.length()))); //trim reverse barcode - if(qual.getName() != ""){ - qual.trimQScores(-1, rawSeq.length()-roligo.length()); - qual.trimQScores(foligo.length(), -1); - } - success = 0; - trashCode = ""; - return success; - } - }else { - trashCode = "b"; - if (!compareDNASeq(roligo, rawSeq.substr(rawSeq.length()-roligo.length(),roligo.length()))) { trashCode += "d"; } - } - } - //cout << "success=" << success << endl; - //if you found the barcode or if you don't want to allow for diffs - if (bdiffs == 0) { return success; } - else { //try aligning and see if you can find it - Alignment* alignment; - if (ifbarcodes.size() > 0) { alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxFBarcodeLength+bdiffs+1)); } - else{ alignment = NULL; } - - //can you find the barcode - int minDiff = 1e6; - int minCount = 1; - vector< vector > minFGroup; - vector minFPos; + success = bdiffs + 10; //if the sequence is shorter than the barcode then bail out + break; + } + + if((compareDNASeq(foligo, rawSeq.substr(0,foligo.length()))) && (compareDNASeq(roligo, rawSeq.substr(rawSeq.length()-roligo.length(),roligo.length())))) { + group = it->first; + string trimmedSeq = rawSeq.substr(foligo.length()); //trim forward barcode + seq.setUnaligned(trimmedSeq.substr(0,(trimmedSeq.length()-roligo.length()))); //trim reverse barcode + if(qual.getName() != ""){ + qual.trimQScores(-1, rawSeq.length()-roligo.length()); + qual.trimQScores(foligo.length(), -1); + } + success = 0; + break; + } + } + //cout << "success=" << success << endl; + //if you found the barcode or if you don't want to allow for diffs + if ((bdiffs == 0) || (success == 0)) { return success; } + else { //try aligning and see if you can find it + Alignment* alignment; + if (ifbarcodes.size() > 0) { alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxFBarcodeLength+bdiffs+1)); } + else{ alignment = NULL; } + + //can you find the barcode + int minDiff = 1e6; + int minCount = 1; + vector< vector > minFGroup; + vector minFPos; //the pair can have duplicates, but we only want to search for the unique forward and reverses, example /* - 1 Sarah Westcott - 2 John Westcott - 3 Anna Westcott - 4 Sarah Schloss - 5 Pat Schloss - 6 Gail Brown - 7 Pat Moore - + 1 Sarah Westcott + 2 John Westcott + 3 Anna Westcott + 4 Sarah Schloss + 5 Pat Schloss + 6 Gail Brown + 7 Pat Moore only want to look for forward = Sarah, John, Anna, Pat, Gail reverse = Westcott, Schloss, Brown, Moore - - but if best match forward = 4, and reverse = 1, we want to count as a valid match because forward 1 and forward 4 are the same. so both barcodes map to same group. + but if best match forward = 4, and reverse = 1, we want to count as a valid match because forward 1 and forward 4 are the same. so both barcodes map to same group. */ //cout << endl << forwardSeq.getName() << endl; - for(map >::iterator it=ifbarcodes.begin();it!=ifbarcodes.end();it++){ - string oligo = it->first; - - if(rawSeq.length() < maxFBarcodeLength){ //let's just assume that the barcodes are the same length - success = bdiffs + 10; - break; - } - //cout << "before = " << oligo << '\t' << rawSeq.substr(0,oligo.length()+bdiffs) << endl; - //use needleman to align first barcode.length()+numdiffs of sequence to each barcode - alignment->alignPrimer(oligo, rawSeq.substr(0,oligo.length()+bdiffs)); - oligo = alignment->getSeqAAln(); - string temp = alignment->getSeqBAln(); - - int alnLength = oligo.length(); - - for(int i=oligo.length()-1;i>=0;i--){ if(oligo[i] != '-'){ alnLength = i+1; break; } } - oligo = oligo.substr(0,alnLength); - temp = temp.substr(0,alnLength); + for(map >::iterator it=ifbarcodes.begin();it!=ifbarcodes.end();it++){ + string oligo = it->first; + + if(rawSeq.length() < maxFBarcodeLength){ //let's just assume that the barcodes are the same length + success = bdiffs + 10; + break; + } + //cout << "before = " << oligo << '\t' << rawSeq.substr(0,oligo.length()+bdiffs) << endl; + //use needleman to align first barcode.length()+numdiffs of sequence to each barcode + alignment->alignPrimer(oligo, rawSeq.substr(0,oligo.length()+bdiffs)); + oligo = alignment->getSeqAAln(); + string temp = alignment->getSeqBAln(); + + int alnLength = oligo.length(); + + for(int i=oligo.length()-1;i>=0;i--){ if(oligo[i] != '-'){ alnLength = i+1; break; } } + oligo = oligo.substr(0,alnLength); + temp = temp.substr(0,alnLength); int numDiff = countDiffs(oligo, temp); - + if (alnLength == 0) { numDiff = bdiffs + 100; } //cout << "after = " << oligo << '\t' << temp << '\t' << numDiff << endl; - if(numDiff < minDiff){ - minDiff = numDiff; - minCount = 1; - minFGroup.clear(); + if(numDiff < minDiff){ + minDiff = numDiff; + minCount = 1; + minFGroup.clear(); minFGroup.push_back(it->second); - int tempminFPos = 0; + int tempminFPos = 0; minFPos.clear(); - for(int i=0;isecond); + }else if(numDiff == minDiff){ + minFGroup.push_back(it->second); int tempminFPos = 0; - for(int i=0;i bdiffs) { trashCode = "b"; minFGroup.clear(); success = minDiff; } //no good matches - //else{ + //cout << minDiff << '\t' << minCount << '\t' << endl; + if(minDiff > bdiffs) { success = minDiff; } //no good matches + else{ //check for reverse match - if (alignment != NULL) { delete alignment; } + if (alignment != NULL) { delete alignment; } - if (irbarcodes.size() > 0) { alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxRBarcodeLength+bdiffs+1)); } + if (irbarcodes.size() > 0) { alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxRBarcodeLength+bdiffs+1)); } else{ alignment = NULL; } //can you find the barcode @@ -1035,7 +985,7 @@ int TrimOligos::stripPairedBarcode(Sequence& seq, QualityScores& qual, int& grou vector minRPos; string rawRSequence = reverseOligo(seq.getUnaligned()); - //cout << irbarcodes.size() << '\t' << maxRBarcodeLength << endl; + //cout << irbarcodes.size() << '\t' << maxRBarcodeLength << endl; for(map >::iterator it=irbarcodes.begin();it!=irbarcodes.end();it++){ string oligo = reverseOligo(it->first); //cout << "r before = " << reverseOligo(oligo) << '\t' << reverseOligo(rawRSequence.substr(0,oligo.length()+bdiffs)) << endl; @@ -1086,18 +1036,18 @@ int TrimOligos::stripPairedBarcode(Sequence& seq, QualityScores& qual, int& grou /*cout << minDiff << '\t' << minCount << '\t' << endl; for (int i = 0; i < minFGroup.size(); i++) { cout << i << '\t'; - for (int j = 0; j < minFGroup[i].size(); j++) { cout << minFGroup[i][j] << " "; } + for (int j = 0; j < minFGroup[i].size(); j++) { cout << minFGroup[i][j] << " "; } cout << endl; } cout << endl; for (int i = 0; i < minRGroup.size(); i++) { cout << i << '\t'; - for (int j = 0; j < minRGroup[i].size(); j++) { cout << minRGroup[i][j] << " "; } + for (int j = 0; j < minRGroup[i].size(); j++) { cout << minRGroup[i][j] << " "; } cout << endl; } cout << endl;*/ - if(minDiff > bdiffs) { trashCode += "d"; success = minDiff; } //no good matches - else { + if(minDiff > bdiffs) { success = minDiff; } //no good matches + else { bool foundMatch = false; vector matches; for (int i = 0; i < minFGroup.size(); i++) { @@ -1127,155 +1077,142 @@ int TrimOligos::stripPairedBarcode(Sequence& seq, QualityScores& qual, int& grou } success = minDiff; //cout << "barcode = " << ipbarcodes[group].forward << '\t' << ipbarcodes[group].reverse << endl; - }else { if (trashCode == "") { trashCode = "bd"; } success = bdiffs + 100; } - } - //} - - if (alignment != NULL) { delete alignment; } - } - - //catchall for case I didn't think of - if ((trashCode == "") && (success > bdiffs)) { trashCode = "bd"; } - - return success; - - } - catch(exception& e) { - m->errorOut(e, "TrimOligos", "stripPairedBarcode"); - exit(1); - } - + }else { success = bdiffs + 100; } + } + } + + if (alignment != NULL) { delete alignment; } + } + + return success; + + } + catch(exception& e) { + m->errorOut(e, "TrimOligos", "stripPairedBarcode"); + exit(1); + } + } //*******************************************************************/ int TrimOligos::stripPairedPrimers(Sequence& seq, QualityScores& qual, int& group, bool keepForward){ - try { - trashCode = ""; - //look for forward - string rawSeq = seq.getUnaligned(); - int success = pdiffs + 1; //guilty until proven innocent + try { + //look for forward + string rawSeq = seq.getUnaligned(); + int success = pdiffs + 1; //guilty until proven innocent //cout << seq.getName() << endl; - //can you find the forward - for(map::iterator it=ipprimers.begin();it!=ipprimers.end();it++){ - string foligo = it->second.forward; + //can you find the forward + for(map::iterator it=ipprimers.begin();it!=ipprimers.end();it++){ + string foligo = it->second.forward; string roligo = it->second.reverse; //cout << it->first << '\t' << foligo << '\t' << roligo << endl; //cout << it->first << '\t' << rawSeq.substr(0,foligo.length()) << '\t' << rawSeq.substr(rawSeq.length()-roligo.length(),roligo.length()) << endl; - if(rawSeq.length() < foligo.length()){ //let's just assume that the barcodes are the same length - success = pdiffs + 10; //if the sequence is shorter than the barcode then bail out - trashCode = "p"; - break; - } + if(rawSeq.length() < foligo.length()){ //let's just assume that the barcodes are the same length + success = pdiffs + 10; //if the sequence is shorter than the barcode then bail out + break; + } if(rawSeq.length() < roligo.length()){ //let's just assume that the barcodes are the same length - success = pdiffs + 10; //if the sequence is shorter than the barcode then bail out - trashCode += "q"; - break; - } - - if(compareDNASeq(foligo, rawSeq.substr(0,foligo.length()))) { //find forward - if (compareDNASeq(roligo, rawSeq.substr(rawSeq.length()-roligo.length(),roligo.length()))) { //find reverse - group = it->first; + success = pdiffs + 10; //if the sequence is shorter than the barcode then bail out + break; + } + + if((compareDNASeq(foligo, rawSeq.substr(0,foligo.length()))) && (compareDNASeq(roligo, rawSeq.substr(rawSeq.length()-roligo.length(),roligo.length())))) { + group = it->first; + if (!keepForward) { string trimmedSeq = rawSeq.substr(foligo.length()); //trim forward barcode seq.setUnaligned(trimmedSeq.substr(0,(trimmedSeq.length()-roligo.length()))); //trim reverse barcode if(qual.getName() != ""){ qual.trimQScores(-1, rawSeq.length()-roligo.length()); qual.trimQScores(foligo.length(), -1); } - success = 0; - trashCode = ""; - return success; - } - }else { - trashCode = "b"; - if (!compareDNASeq(roligo, rawSeq.substr(rawSeq.length()-roligo.length(),roligo.length()))) { trashCode += "d"; } - } - + } + success = 0; + break; + } } - //cout << "success=" << success << endl; - //if you found the barcode or if you don't want to allow for diffs - if (pdiffs == 0) { return success; } - else { //try aligning and see if you can find it - Alignment* alignment; - if (ifprimers.size() > 0) { alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxFPrimerLength+pdiffs+1)); } - else{ alignment = NULL; } - - //can you find the barcode - int minDiff = 1e6; - int minCount = 1; - vector< vector > minFGroup; - vector minFPos; + //cout << "success=" << success << endl; + //if you found the barcode or if you don't want to allow for diffs + if ((pdiffs == 0) || (success == 0)) { return success; } + else { //try aligning and see if you can find it + Alignment* alignment; + if (ifprimers.size() > 0) { alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxFPrimerLength+pdiffs+1)); } + else{ alignment = NULL; } + + //can you find the barcode + int minDiff = 1e6; + int minCount = 1; + vector< vector > minFGroup; + vector minFPos; //the pair can have duplicates, but we only want to search for the unique forward and reverses, example /* - 1 Sarah Westcott - 2 John Westcott - 3 Anna Westcott - 4 Sarah Schloss - 5 Pat Schloss - 6 Gail Brown - 7 Pat Moore - + 1 Sarah Westcott + 2 John Westcott + 3 Anna Westcott + 4 Sarah Schloss + 5 Pat Schloss + 6 Gail Brown + 7 Pat Moore only want to look for forward = Sarah, John, Anna, Pat, Gail reverse = Westcott, Schloss, Brown, Moore - - but if best match forward = 4, and reverse = 1, we want to count as a valid match because forward 1 and forward 4 are the same. so both barcodes map to same group. + but if best match forward = 4, and reverse = 1, we want to count as a valid match because forward 1 and forward 4 are the same. so both barcodes map to same group. */ //cout << endl << forwardSeq.getName() << endl; - for(map >::iterator it=ifprimers.begin();it!=ifprimers.end();it++){ - string oligo = it->first; - - if(rawSeq.length() < maxFPrimerLength){ //let's just assume that the barcodes are the same length - success = pdiffs + 10; - break; - } - //cout << "before = " << oligo << '\t' << rawSeq.substr(0,oligo.length()+bdiffs) << endl; - //use needleman to align first barcode.length()+numdiffs of sequence to each barcode - alignment->alignPrimer(oligo, rawSeq.substr(0,oligo.length()+pdiffs)); - oligo = alignment->getSeqAAln(); - string temp = alignment->getSeqBAln(); - - int alnLength = oligo.length(); - - for(int i=oligo.length()-1;i>=0;i--){ if(oligo[i] != '-'){ alnLength = i+1; break; } } - oligo = oligo.substr(0,alnLength); - temp = temp.substr(0,alnLength); + for(map >::iterator it=ifprimers.begin();it!=ifprimers.end();it++){ + string oligo = it->first; + + if(rawSeq.length() < maxFPrimerLength){ //let's just assume that the barcodes are the same length + success = pdiffs + 10; + break; + } + //cout << "before = " << oligo << '\t' << rawSeq.substr(0,oligo.length()+bdiffs) << endl; + //use needleman to align first barcode.length()+numdiffs of sequence to each barcode + alignment->alignPrimer(oligo, rawSeq.substr(0,oligo.length()+pdiffs)); + oligo = alignment->getSeqAAln(); + string temp = alignment->getSeqBAln(); + + int alnLength = oligo.length(); + + for(int i=oligo.length()-1;i>=0;i--){ if(oligo[i] != '-'){ alnLength = i+1; break; } } + oligo = oligo.substr(0,alnLength); + temp = temp.substr(0,alnLength); int numDiff = countDiffs(oligo, temp); - + if (alnLength == 0) { numDiff = pdiffs + 100; } //cout << "after = " << oligo << '\t' << temp << '\t' << numDiff << endl; - if(numDiff < minDiff){ - minDiff = numDiff; - minCount = 1; - minFGroup.clear(); + if(numDiff < minDiff){ + minDiff = numDiff; + minCount = 1; + minFGroup.clear(); minFGroup.push_back(it->second); - int tempminFPos = 0; + int tempminFPos = 0; minFPos.clear(); - for(int i=0;isecond); + }else if(numDiff == minDiff){ + minFGroup.push_back(it->second); int tempminFPos = 0; - for(int i=0;i pdiffs) { trashCode = "p"; minFGroup.clear(); success = minDiff; } //no good matches - //else{ + //cout << minDiff << '\t' << minCount << '\t' << endl; + if(minDiff > pdiffs) { success = minDiff; } //no good matches + else{ //check for reverse match - if (alignment != NULL) { delete alignment; } + if (alignment != NULL) { delete alignment; } - if (irprimers.size() > 0) { alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxRPrimerLength+pdiffs+1)); } + if (irprimers.size() > 0) { alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxRPrimerLength+pdiffs+1)); } else{ alignment = NULL; } //can you find the barcode @@ -1336,18 +1273,18 @@ int TrimOligos::stripPairedPrimers(Sequence& seq, QualityScores& qual, int& grou /*cout << minDiff << '\t' << minCount << '\t' << endl; for (int i = 0; i < minFGroup.size(); i++) { cout << i << '\t'; - for (int j = 0; j < minFGroup[i].size(); j++) { cout << minFGroup[i][j] << " "; } + for (int j = 0; j < minFGroup[i].size(); j++) { cout << minFGroup[i][j] << " "; } cout << endl; } cout << endl; for (int i = 0; i < minRGroup.size(); i++) { cout << i << '\t'; - for (int j = 0; j < minRGroup[i].size(); j++) { cout << minRGroup[i][j] << " "; } + for (int j = 0; j < minRGroup[i].size(); j++) { cout << minRGroup[i][j] << " "; } cout << endl; } cout << endl;*/ - if(minDiff > pdiffs) { trashCode += "q"; success = minDiff; } //no good matches - else { + if(minDiff > pdiffs) { success = minDiff; } //no good matches + else { bool foundMatch = false; vector matches; for (int i = 0; i < minFGroup.size(); i++) { @@ -1379,144 +1316,139 @@ int TrimOligos::stripPairedPrimers(Sequence& seq, QualityScores& qual, int& grou } success = minDiff; //cout << "primer = " << ipprimers[group].forward << '\t' << ipprimers[group].reverse << endl; - }else { if (trashCode == "") { trashCode = "pq"; } success = pdiffs + 100; } - } - //} - - if (alignment != NULL) { delete alignment; } - } - - //catchall for case I didn't think of - if ((trashCode == "") && (success > pdiffs)) { trashCode = "pq"; } - - return success; - - } - catch(exception& e) { - m->errorOut(e, "TrimOligos", "stripPairedPrimers"); - exit(1); - } - -} - + }else { success = pdiffs + 100; } + } + } + + if (alignment != NULL) { delete alignment; } + } + + return success; + + } + catch(exception& e) { + m->errorOut(e, "TrimOligos", "stripPairedPrimers"); + exit(1); + } + +} + //*******************************************************************/ int TrimOligos::stripForward(Sequence& forwardSeq, Sequence& reverseSeq, QualityScores& forwardQual, QualityScores& reverseQual, int& group){ - try { - //look for forward barcode - string rawFSequence = forwardSeq.getUnaligned(); + try { + //look for forward barcode + string rawFSequence = forwardSeq.getUnaligned(); string rawRSequence = reverseSeq.getUnaligned(); - int success = pdiffs + 1; //guilty until proven innocent - - //can you find the forward barcode - for(map::iterator it=ipprimers.begin();it!=ipprimers.end();it++){ - string foligo = it->second.forward; + int success = pdiffs + 1; //guilty until proven innocent + + //can you find the forward barcode + for(map::iterator it=ipprimers.begin();it!=ipprimers.end();it++){ + string foligo = it->second.forward; string roligo = it->second.reverse; - if(rawFSequence.length() < foligo.length()){ //let's just assume that the barcodes are the same length - success = pdiffs + 10; //if the sequence is shorter than the barcode then bail out - break; - } + if(rawFSequence.length() < foligo.length()){ //let's just assume that the barcodes are the same length + success = pdiffs + 10; //if the sequence is shorter than the barcode then bail out + break; + } if(rawRSequence.length() < roligo.length()){ //let's just assume that the barcodes are the same length - success = pdiffs + 10; //if the sequence is shorter than the barcode then bail out - break; - } - - if((compareDNASeq(foligo, rawFSequence.substr(0,foligo.length()))) && (compareDNASeq(roligo, rawRSequence.substr((rawRSequence.length()-roligo.length()),roligo.length())))) { - group = it->first; - forwardSeq.setUnaligned(rawFSequence.substr(foligo.length())); + success = pdiffs + 10; //if the sequence is shorter than the barcode then bail out + break; + } + + if((compareDNASeq(foligo, rawFSequence.substr(0,foligo.length()))) && (compareDNASeq(roligo, rawRSequence.substr((rawRSequence.length()-roligo.length()),roligo.length())))) { + group = it->first; + forwardSeq.setUnaligned(rawFSequence.substr(foligo.length())); reverseSeq.setUnaligned(rawRSequence.substr(0,(rawRSequence.length()-roligo.length()))); forwardQual.trimQScores(foligo.length(), -1); reverseQual.trimQScores(-1, rawRSequence.length()-roligo.length()); - success = 0; - break; - } - } - - //if you found the barcode or if you don't want to allow for diffs - if ((pdiffs == 0) || (success == 0)) { return success; } - else { //try aligning and see if you can find it - Alignment* alignment; - if (ifprimers.size() > 0) { alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxFPrimerLength+pdiffs+1)); } - else{ alignment = NULL; } - - //can you find the barcode - int minDiff = 1e6; - int minCount = 1; - vector< vector > minFGroup; - vector minFPos; + success = 0; + break; + } + } + + //if you found the barcode or if you don't want to allow for diffs + if ((pdiffs == 0) || (success == 0)) { return success; } + else { //try aligning and see if you can find it + Alignment* alignment; + if (ifprimers.size() > 0) { alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxFPrimerLength+pdiffs+1)); } + else{ alignment = NULL; } + + //can you find the barcode + int minDiff = 1e6; + int minCount = 1; + vector< vector > minFGroup; + vector minFPos; //the pair can have duplicates, but we only want to search for the unique forward and reverses, example /* - 1 Sarah Westcott - 2 John Westcott - 3 Anna Westcott - 4 Sarah Schloss - 5 Pat Schloss - 6 Gail Brown - 7 Pat Moore - + 1 Sarah Westcott + 2 John Westcott + 3 Anna Westcott + 4 Sarah Schloss + 5 Pat Schloss + 6 Gail Brown + 7 Pat Moore only want to look for forward = Sarah, John, Anna, Pat, Gail reverse = Westcott, Schloss, Brown, Moore - - but if best match forward = 4, and reverse = 1, we want to count as a valid match because forward 1 and forward 4 are the same. so both barcodes map to same group. + but if best match forward = 4, and reverse = 1, we want to count as a valid match because forward 1 and forward 4 are the same. so both barcodes map to same group. */ - //cout << endl << forwardSeq.getName() << endl; - for(map >::iterator it=ifprimers.begin();it!=ifprimers.end();it++){ - string oligo = it->first; - - if(rawFSequence.length() < maxFPrimerLength){ //let's just assume that the barcodes are the same length - success = pdiffs + 10; - break; - } - //cout << "before = " << oligo << '\t' << rawFSequence.substr(0,oligo.length()+pdiffs) << endl; - //use needleman to align first barcode.length()+numdiffs of sequence to each barcode - alignment->alignPrimer(oligo, rawFSequence.substr(0,oligo.length()+pdiffs)); - oligo = alignment->getSeqAAln(); - string temp = alignment->getSeqBAln(); - - int alnLength = oligo.length(); - - for(int i=oligo.length()-1;i>=0;i--){ if(oligo[i] != '-'){ alnLength = i+1; break; } } - oligo = oligo.substr(0,alnLength); - temp = temp.substr(0,alnLength); + //cout << endl << forwardSeq.getName() << endl; + for(map >::iterator it=ifprimers.begin();it!=ifprimers.end();it++){ + string oligo = it->first; + + if(rawFSequence.length() < maxFPrimerLength){ //let's just assume that the barcodes are the same length + success = pdiffs + 10; + break; + } + //cout << "before = " << oligo << '\t' << rawFSequence.substr(0,oligo.length()+pdiffs) << endl; + //use needleman to align first barcode.length()+numdiffs of sequence to each barcode + alignment->alignPrimer(oligo, rawFSequence.substr(0,oligo.length()+pdiffs)); + oligo = alignment->getSeqAAln(); + string temp = alignment->getSeqBAln(); + + int alnLength = oligo.length(); + + for(int i=oligo.length()-1;i>=0;i--){ if(oligo[i] != '-'){ alnLength = i+1; break; } } + oligo = oligo.substr(0,alnLength); + temp = temp.substr(0,alnLength); int numDiff = countDiffs(oligo, temp); - + if (alnLength == 0) { numDiff = pdiffs + 100; } //cout << "after = " << oligo << '\t' << temp << '\t' << numDiff << endl; - if(numDiff < minDiff){ - minDiff = numDiff; - minCount = 1; - minFGroup.clear(); + if(numDiff < minDiff){ + minDiff = numDiff; + minCount = 1; + minFGroup.clear(); minFGroup.push_back(it->second); - int tempminFPos = 0; + int tempminFPos = 0; minFPos.clear(); - for(int i=0;isecond); + }else if(numDiff == minDiff){ + minFGroup.push_back(it->second); int tempminFPos = 0; - for(int i=0;i pdiffs) { success = minDiff; } //no good matches - else{ + //cout << minDiff << '\t' << minCount << '\t' << endl; + if(minDiff > pdiffs) { success = minDiff; } //no good matches + else{ //check for reverse match - if (alignment != NULL) { delete alignment; } + if (alignment != NULL) { delete alignment; } - if (irbarcodes.size() > 0) { alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxRPrimerLength+pdiffs+1)); } - else{ alignment = NULL; } + if (irbarcodes.size() > 0) { alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxRPrimerLength+pdiffs+1)); } + else{ alignment = NULL; } //can you find the barcode minDiff = 1e6; @@ -1557,7 +1489,7 @@ int TrimOligos::stripForward(Sequence& forwardSeq, Sequence& reverseSeq, Quality tempminRPos++; } } - minRPos.push_back(tempminRPos); + minRPos.push_back(tempminRPos); }else if(numDiff == minDiff){ int tempminRPos = 0; for(int i=0;isecond); } } /*cout << minDiff << '\t' << minCount << '\t' << endl; - for (int i = 0; i < minFGroup.size(); i++) { + for (int i = 0; i < minFGroup.size(); i++) { cout << i << '\t'; - for (int j = 0; j < minFGroup[i].size(); j++) { cout << minFGroup[i][j] << " "; } + for (int j = 0; j < minFGroup[i].size(); j++) { cout << minFGroup[i][j] << " "; } cout << endl; } cout << endl; - for (int i = 0; i < minRGroup.size(); i++) { + for (int i = 0; i < minRGroup.size(); i++) { cout << i << '\t'; - for (int j = 0; j < minRGroup[i].size(); j++) { cout << minRGroup[i][j] << " "; } + for (int j = 0; j < minRGroup[i].size(); j++) { cout << minRGroup[i][j] << " "; } cout << endl; } cout << endl;*/ - if(minDiff > pdiffs) { success = minDiff; } //no good matches - else { + if(minDiff > pdiffs) { success = minDiff; } //no good matches + else { bool foundMatch = false; vector matches; for (int i = 0; i < minFGroup.size(); i++) { @@ -1598,9 +1530,9 @@ int TrimOligos::stripForward(Sequence& forwardSeq, Sequence& reverseSeq, Quality int fStart = 0; int rStart = 0; - if (matches.size() == 1) { + if (matches.size() == 1) { foundMatch = true; - group = matches[0]; + group = matches[0]; fStart = minFPos[0]; rStart = minRPos[0]; } @@ -1614,136 +1546,134 @@ int TrimOligos::stripForward(Sequence& forwardSeq, Sequence& reverseSeq, Quality success = minDiff; }else { success = pdiffs + 100; } } - } - - if (alignment != NULL) { delete alignment; } - } - - return success; - - } - catch(exception& e) { - m->errorOut(e, "TrimOligos", "stripIForward"); - exit(1); - } - + } + + if (alignment != NULL) { delete alignment; } + } + + return success; + + } + catch(exception& e) { + m->errorOut(e, "TrimOligos", "stripIForward"); + exit(1); + } + } //*******************************************************************/ int TrimOligos::stripForward(Sequence& forwardSeq, Sequence& reverseSeq, int& group){ - try { - //look for forward barcode - string rawFSequence = forwardSeq.getUnaligned(); + try { + //look for forward barcode + string rawFSequence = forwardSeq.getUnaligned(); string rawRSequence = reverseSeq.getUnaligned(); - int success = pdiffs + 1; //guilty until proven innocent - - //can you find the forward barcode - for(map::iterator it=ipprimers.begin();it!=ipprimers.end();it++){ - string foligo = it->second.forward; + int success = pdiffs + 1; //guilty until proven innocent + + //can you find the forward barcode + for(map::iterator it=ipprimers.begin();it!=ipprimers.end();it++){ + string foligo = it->second.forward; string roligo = it->second.reverse; - if(rawFSequence.length() < foligo.length()){ //let's just assume that the barcodes are the same length - success = pdiffs + 10; //if the sequence is shorter than the barcode then bail out - break; - } + if(rawFSequence.length() < foligo.length()){ //let's just assume that the barcodes are the same length + success = pdiffs + 10; //if the sequence is shorter than the barcode then bail out + break; + } if(rawRSequence.length() < roligo.length()){ //let's just assume that the barcodes are the same length - success = pdiffs + 10; //if the sequence is shorter than the barcode then bail out - break; - } - - if((compareDNASeq(foligo, rawFSequence.substr(0,foligo.length()))) && (compareDNASeq(roligo, rawRSequence.substr((rawRSequence.length()-roligo.length()),roligo.length())))) { - group = it->first; - forwardSeq.setUnaligned(rawFSequence.substr(foligo.length())); + success = pdiffs + 10; //if the sequence is shorter than the barcode then bail out + break; + } + + if((compareDNASeq(foligo, rawFSequence.substr(0,foligo.length()))) && (compareDNASeq(roligo, rawRSequence.substr((rawRSequence.length()-roligo.length()),roligo.length())))) { + group = it->first; + forwardSeq.setUnaligned(rawFSequence.substr(foligo.length())); reverseSeq.setUnaligned(rawRSequence.substr(0,(rawRSequence.length()-roligo.length()))); - success = 0; - break; - } - } - - //if you found the barcode or if you don't want to allow for diffs - if ((pdiffs == 0) || (success == 0)) { return success; } - else { //try aligning and see if you can find it - Alignment* alignment; - if (ifprimers.size() > 0) { alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxFPrimerLength+pdiffs+1)); } - else{ alignment = NULL; } - - //can you find the barcode - int minDiff = 1e6; - int minCount = 1; - vector< vector > minFGroup; - vector minFPos; + success = 0; + break; + } + } + + //if you found the barcode or if you don't want to allow for diffs + if ((pdiffs == 0) || (success == 0)) { return success; } + else { //try aligning and see if you can find it + Alignment* alignment; + if (ifprimers.size() > 0) { alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxFPrimerLength+pdiffs+1)); } + else{ alignment = NULL; } + + //can you find the barcode + int minDiff = 1e6; + int minCount = 1; + vector< vector > minFGroup; + vector minFPos; //the pair can have duplicates, but we only want to search for the unique forward and reverses, example /* - 1 Sarah Westcott - 2 John Westcott - 3 Anna Westcott - 4 Sarah Schloss - 5 Pat Schloss - 6 Gail Brown - 7 Pat Moore - + 1 Sarah Westcott + 2 John Westcott + 3 Anna Westcott + 4 Sarah Schloss + 5 Pat Schloss + 6 Gail Brown + 7 Pat Moore only want to look for forward = Sarah, John, Anna, Pat, Gail reverse = Westcott, Schloss, Brown, Moore - - but if best match forward = 4, and reverse = 1, we want to count as a valid match because forward 1 and forward 4 are the same. so both barcodes map to same group. + but if best match forward = 4, and reverse = 1, we want to count as a valid match because forward 1 and forward 4 are the same. so both barcodes map to same group. */ - //cout << endl << forwardSeq.getName() << endl; - for(map >::iterator it=ifprimers.begin();it!=ifprimers.end();it++){ - string oligo = it->first; - - if(rawFSequence.length() < maxFPrimerLength){ //let's just assume that the barcodes are the same length - success = pdiffs + 10; - break; - } - //cout << "before = " << oligo << '\t' << rawFSequence.substr(0,oligo.length()+pdiffs) << endl; - //use needleman to align first barcode.length()+numdiffs of sequence to each barcode - alignment->alignPrimer(oligo, rawFSequence.substr(0,oligo.length()+pdiffs)); - oligo = alignment->getSeqAAln(); - string temp = alignment->getSeqBAln(); - - int alnLength = oligo.length(); - - for(int i=oligo.length()-1;i>=0;i--){ if(oligo[i] != '-'){ alnLength = i+1; break; } } - oligo = oligo.substr(0,alnLength); - temp = temp.substr(0,alnLength); + //cout << endl << forwardSeq.getName() << endl; + for(map >::iterator it=ifprimers.begin();it!=ifprimers.end();it++){ + string oligo = it->first; + + if(rawFSequence.length() < maxFPrimerLength){ //let's just assume that the barcodes are the same length + success = pdiffs + 10; + break; + } + //cout << "before = " << oligo << '\t' << rawFSequence.substr(0,oligo.length()+pdiffs) << endl; + //use needleman to align first barcode.length()+numdiffs of sequence to each barcode + alignment->alignPrimer(oligo, rawFSequence.substr(0,oligo.length()+pdiffs)); + oligo = alignment->getSeqAAln(); + string temp = alignment->getSeqBAln(); + + int alnLength = oligo.length(); + + for(int i=oligo.length()-1;i>=0;i--){ if(oligo[i] != '-'){ alnLength = i+1; break; } } + oligo = oligo.substr(0,alnLength); + temp = temp.substr(0,alnLength); int numDiff = countDiffs(oligo, temp); - + if (alnLength == 0) { numDiff = pdiffs + 100; } //cout << "after = " << oligo << '\t' << temp << '\t' << numDiff << endl; - if(numDiff < minDiff){ - minDiff = numDiff; - minCount = 1; - minFGroup.clear(); + if(numDiff < minDiff){ + minDiff = numDiff; + minCount = 1; + minFGroup.clear(); minFGroup.push_back(it->second); - int tempminFPos = 0; + int tempminFPos = 0; minFPos.clear(); - for(int i=0;isecond); + }else if(numDiff == minDiff){ + minFGroup.push_back(it->second); int tempminFPos = 0; - for(int i=0;i pdiffs) { success = minDiff; } //no good matches - else{ + //cout << minDiff << '\t' << minCount << '\t' << endl; + if(minDiff > pdiffs) { success = minDiff; } //no good matches + else{ //check for reverse match - if (alignment != NULL) { delete alignment; } + if (alignment != NULL) { delete alignment; } - if (irbarcodes.size() > 0) { alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxRPrimerLength+pdiffs+1)); } - else{ alignment = NULL; } + if (irbarcodes.size() > 0) { alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxRPrimerLength+pdiffs+1)); } + else{ alignment = NULL; } //can you find the barcode minDiff = 1e6; @@ -1784,7 +1714,7 @@ int TrimOligos::stripForward(Sequence& forwardSeq, Sequence& reverseSeq, int& gr tempminRPos++; } } - minRPos.push_back(tempminRPos); + minRPos.push_back(tempminRPos); }else if(numDiff == minDiff){ int tempminRPos = 0; for(int i=0;isecond); } } /*cout << minDiff << '\t' << minCount << '\t' << endl; - for (int i = 0; i < minFGroup.size(); i++) { + for (int i = 0; i < minFGroup.size(); i++) { cout << i << '\t'; - for (int j = 0; j < minFGroup[i].size(); j++) { cout << minFGroup[i][j] << " "; } + for (int j = 0; j < minFGroup[i].size(); j++) { cout << minFGroup[i][j] << " "; } cout << endl; } cout << endl; - for (int i = 0; i < minRGroup.size(); i++) { + for (int i = 0; i < minRGroup.size(); i++) { cout << i << '\t'; - for (int j = 0; j < minRGroup[i].size(); j++) { cout << minRGroup[i][j] << " "; } + for (int j = 0; j < minRGroup[i].size(); j++) { cout << minRGroup[i][j] << " "; } cout << endl; } cout << endl;*/ - if(minDiff > pdiffs) { success = minDiff; } //no good matches - else { + if(minDiff > pdiffs) { success = minDiff; } //no good matches + else { bool foundMatch = false; vector matches; for (int i = 0; i < minFGroup.size(); i++) { @@ -1825,9 +1755,9 @@ int TrimOligos::stripForward(Sequence& forwardSeq, Sequence& reverseSeq, int& gr int fStart = 0; int rStart = 0; - if (matches.size() == 1) { + if (matches.size() == 1) { foundMatch = true; - group = matches[0]; + group = matches[0]; fStart = minFPos[0]; rStart = minRPos[0]; } @@ -1839,856 +1769,856 @@ int TrimOligos::stripForward(Sequence& forwardSeq, Sequence& reverseSeq, int& gr success = minDiff; }else { success = pdiffs + 100; } } - } - - if (alignment != NULL) { delete alignment; } - } - - return success; - - } - catch(exception& e) { - m->errorOut(e, "TrimOligos", "stripIForward"); - exit(1); - } - + } + + if (alignment != NULL) { delete alignment; } + } + + return success; + + } + catch(exception& e) { + m->errorOut(e, "TrimOligos", "stripIForward"); + exit(1); + } + } //*******************************************************************/ int TrimOligos::stripBarcode(Sequence& seq, int& group){ - try { - - string rawSequence = seq.getUnaligned(); - int success = bdiffs + 1; //guilty until proven innocent - - //can you find the barcode - for(map::iterator it=barcodes.begin();it!=barcodes.end();it++){ - string oligo = it->first; - if(rawSequence.length() < oligo.length()){ //let's just assume that the barcodes are the same length - success = bdiffs + 10; //if the sequence is shorter than the barcode then bail out - break; - } - - if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){ - group = it->second; - seq.setUnaligned(rawSequence.substr(oligo.length())); - - success = 0; - break; - } - } - - //if you found the barcode or if you don't want to allow for diffs - if ((bdiffs == 0) || (success == 0)) { return success; } - - else { //try aligning and see if you can find it + try { + + string rawSequence = seq.getUnaligned(); + int success = bdiffs + 1; //guilty until proven innocent + + //can you find the barcode + for(map::iterator it=barcodes.begin();it!=barcodes.end();it++){ + string oligo = it->first; + if(rawSequence.length() < oligo.length()){ //let's just assume that the barcodes are the same length + success = bdiffs + 10; //if the sequence is shorter than the barcode then bail out + break; + } + + if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){ + group = it->second; + seq.setUnaligned(rawSequence.substr(oligo.length())); + + success = 0; + break; + } + } + + //if you found the barcode or if you don't want to allow for diffs + if ((bdiffs == 0) || (success == 0)) { return success; } + + else { //try aligning and see if you can find it Alignment* alignment; - if (barcodes.size() > 0) { alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxFBarcodeLength+bdiffs+1)); } - else{ alignment = NULL; } - - //can you find the barcode - int minDiff = 1e6; - int minCount = 1; - int minGroup = -1; - int minPos = 0; - - for(map::iterator it=barcodes.begin();it!=barcodes.end();it++){ - string oligo = it->first; - // int length = oligo.length(); - - if(rawSequence.length() < maxFBarcodeLength){ //let's just assume that the barcodes are the same length - success = bdiffs + 10; - break; - } - - //use needleman to align first barcode.length()+numdiffs of sequence to each barcode - alignment->alignPrimer(oligo, rawSequence.substr(0,oligo.length()+bdiffs)); - oligo = alignment->getSeqAAln(); - string temp = alignment->getSeqBAln(); - - int alnLength = oligo.length(); - - for(int i=oligo.length()-1;i>=0;i--){ - if(oligo[i] != '-'){ alnLength = i+1; break; } - } - oligo = oligo.substr(0,alnLength); - temp = temp.substr(0,alnLength); - - int numDiff = countDiffs(oligo, temp); - - if(numDiff < minDiff){ - minDiff = numDiff; - minCount = 1; - minGroup = it->second; - minPos = 0; - for(int i=0;i bdiffs) { success = minDiff; } //no good matches - else if(minCount > 1) { success = bdiffs + 100; } //can't tell the difference between multiple barcodes - else{ //use the best match - group = minGroup; - seq.setUnaligned(rawSequence.substr(minPos)); - success = minDiff; - } - - if (alignment != NULL) { delete alignment; } - - } - - return success; - - } - catch(exception& e) { - m->errorOut(e, "TrimOligos", "stripBarcode"); - exit(1); - } - + if (barcodes.size() > 0) { alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxFBarcodeLength+bdiffs+1)); } + else{ alignment = NULL; } + + //can you find the barcode + int minDiff = 1e6; + int minCount = 1; + int minGroup = -1; + int minPos = 0; + + for(map::iterator it=barcodes.begin();it!=barcodes.end();it++){ + string oligo = it->first; + // int length = oligo.length(); + + if(rawSequence.length() < maxFBarcodeLength){ //let's just assume that the barcodes are the same length + success = bdiffs + 10; + break; + } + + //use needleman to align first barcode.length()+numdiffs of sequence to each barcode + alignment->alignPrimer(oligo, rawSequence.substr(0,oligo.length()+bdiffs)); + oligo = alignment->getSeqAAln(); + string temp = alignment->getSeqBAln(); + + int alnLength = oligo.length(); + + for(int i=oligo.length()-1;i>=0;i--){ + if(oligo[i] != '-'){ alnLength = i+1; break; } + } + oligo = oligo.substr(0,alnLength); + temp = temp.substr(0,alnLength); + + int numDiff = countDiffs(oligo, temp); + + if(numDiff < minDiff){ + minDiff = numDiff; + minCount = 1; + minGroup = it->second; + minPos = 0; + for(int i=0;i bdiffs) { success = minDiff; } //no good matches + else if(minCount > 1) { success = bdiffs + 100; } //can't tell the difference between multiple barcodes + else{ //use the best match + group = minGroup; + seq.setUnaligned(rawSequence.substr(minPos)); + success = minDiff; + } + + if (alignment != NULL) { delete alignment; } + + } + + return success; + + } + catch(exception& e) { + m->errorOut(e, "TrimOligos", "stripBarcode"); + exit(1); + } + } /********************************************************************/ int TrimOligos::stripForward(Sequence& seq, int& group){ - try { - - string rawSequence = seq.getUnaligned(); - int success = pdiffs + 1; //guilty until proven innocent - - //can you find the primer - for(map::iterator it=primers.begin();it!=primers.end();it++){ - string oligo = it->first; - if(rawSequence.length() < oligo.length()){ //let's just assume that the primers are the same length - success = pdiffs + 10; //if the sequence is shorter than the barcode then bail out - break; - } - - if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){ - group = it->second; - seq.setUnaligned(rawSequence.substr(oligo.length())); - success = 0; - break; - } - } - - //if you found the barcode or if you don't want to allow for diffs - if ((pdiffs == 0) || (success == 0)) { return success; } - - else { //try aligning and see if you can find it - Alignment* alignment; - if (primers.size() > 0) { alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxFPrimerLength+pdiffs+1)); } - else{ alignment = NULL; } - - //can you find the barcode - int minDiff = 1e6; - int minCount = 1; - int minGroup = -1; - int minPos = 0; - - for(map::iterator it=primers.begin();it!=primers.end();it++){ - string oligo = it->first; - // int length = oligo.length(); - - if(rawSequence.length() < maxFPrimerLength){ - success = pdiffs + 100; - break; - } - - //use needleman to align first barcode.length()+numdiffs of sequence to each barcode - alignment->alignPrimer(oligo, rawSequence.substr(0,oligo.length()+pdiffs)); - oligo = alignment->getSeqAAln(); - string temp = alignment->getSeqBAln(); - - int alnLength = oligo.length(); - - for(int i=oligo.length()-1;i>=0;i--){ - if(oligo[i] != '-'){ alnLength = i+1; break; } - } - oligo = oligo.substr(0,alnLength); - temp = temp.substr(0,alnLength); - - int numDiff = countDiffs(oligo, temp); - - if(numDiff < minDiff){ - minDiff = numDiff; - minCount = 1; - minGroup = it->second; - minPos = 0; - for(int i=0;i pdiffs) { success = minDiff; } //no good matches - else if(minCount > 1) { success = pdiffs + 10; } //can't tell the difference between multiple primers - else{ //use the best match - group = minGroup; - seq.setUnaligned(rawSequence.substr(minPos)); - success = minDiff; - } - - if (alignment != NULL) { delete alignment; } - - } - - return success; - - } - catch(exception& e) { - m->errorOut(e, "TrimOligos", "stripForward"); - exit(1); - } + try { + + string rawSequence = seq.getUnaligned(); + int success = pdiffs + 1; //guilty until proven innocent + + //can you find the primer + for(map::iterator it=primers.begin();it!=primers.end();it++){ + string oligo = it->first; + if(rawSequence.length() < oligo.length()){ //let's just assume that the primers are the same length + success = pdiffs + 10; //if the sequence is shorter than the barcode then bail out + break; + } + + if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){ + group = it->second; + seq.setUnaligned(rawSequence.substr(oligo.length())); + success = 0; + break; + } + } + + //if you found the barcode or if you don't want to allow for diffs + if ((pdiffs == 0) || (success == 0)) { return success; } + + else { //try aligning and see if you can find it + Alignment* alignment; + if (primers.size() > 0) { alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxFPrimerLength+pdiffs+1)); } + else{ alignment = NULL; } + + //can you find the barcode + int minDiff = 1e6; + int minCount = 1; + int minGroup = -1; + int minPos = 0; + + for(map::iterator it=primers.begin();it!=primers.end();it++){ + string oligo = it->first; + // int length = oligo.length(); + + if(rawSequence.length() < maxFPrimerLength){ + success = pdiffs + 100; + break; + } + + //use needleman to align first barcode.length()+numdiffs of sequence to each barcode + alignment->alignPrimer(oligo, rawSequence.substr(0,oligo.length()+pdiffs)); + oligo = alignment->getSeqAAln(); + string temp = alignment->getSeqBAln(); + + int alnLength = oligo.length(); + + for(int i=oligo.length()-1;i>=0;i--){ + if(oligo[i] != '-'){ alnLength = i+1; break; } + } + oligo = oligo.substr(0,alnLength); + temp = temp.substr(0,alnLength); + + int numDiff = countDiffs(oligo, temp); + + if(numDiff < minDiff){ + minDiff = numDiff; + minCount = 1; + minGroup = it->second; + minPos = 0; + for(int i=0;i pdiffs) { success = minDiff; } //no good matches + else if(minCount > 1) { success = pdiffs + 10; } //can't tell the difference between multiple primers + else{ //use the best match + group = minGroup; + seq.setUnaligned(rawSequence.substr(minPos)); + success = minDiff; + } + + if (alignment != NULL) { delete alignment; } + + } + + return success; + + } + catch(exception& e) { + m->errorOut(e, "TrimOligos", "stripForward"); + exit(1); + } } //*******************************************************************/ int TrimOligos::stripForward(Sequence& seq, QualityScores& qual, int& group, bool keepForward){ - try { - - if (paired) { int success = stripPairedPrimers(seq, qual, group, keepForward); return success; } - - string rawSequence = seq.getUnaligned(); - int success = pdiffs + 1; //guilty until proven innocent - - //can you find the primer - for(map::iterator it=primers.begin();it!=primers.end();it++){ - string oligo = it->first; - if(rawSequence.length() < oligo.length()){ //let's just assume that the primers are the same length - success = pdiffs + 10; //if the sequence is shorter than the barcode then bail out - break; - } - - if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){ - group = it->second; - if (!keepForward) { seq.setUnaligned(rawSequence.substr(oligo.length())); } - if(qual.getName() != ""){ - if (!keepForward) { qual.trimQScores(oligo.length(), -1); } - } - success = 0; - break; - } - } - - //if you found the barcode or if you don't want to allow for diffs - if ((pdiffs == 0) || (success == 0)) { return success; } - - else { //try aligning and see if you can find it - Alignment* alignment; - if (primers.size() > 0) { alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxFPrimerLength+pdiffs+1)); } - else{ alignment = NULL; } - - //can you find the barcode - int minDiff = 1e6; - int minCount = 1; - int minGroup = -1; - int minPos = 0; - - for(map::iterator it=primers.begin();it!=primers.end();it++){ - string oligo = it->first; - // int length = oligo.length(); - - if(rawSequence.length() < maxFPrimerLength){ - success = pdiffs + 100; - break; - } - - //use needleman to align first barcode.length()+numdiffs of sequence to each barcode - alignment->alignPrimer(oligo, rawSequence.substr(0,oligo.length()+pdiffs)); - oligo = alignment->getSeqAAln(); - string temp = alignment->getSeqBAln(); - - int alnLength = oligo.length(); - - for(int i=oligo.length()-1;i>=0;i--){ - if(oligo[i] != '-'){ alnLength = i+1; break; } - } - oligo = oligo.substr(0,alnLength); - temp = temp.substr(0,alnLength); - - int numDiff = countDiffs(oligo, temp); - - if(numDiff < minDiff){ - minDiff = numDiff; - minCount = 1; - minGroup = it->second; - minPos = 0; - for(int i=0;i pdiffs) { success = minDiff; } //no good matches - else if(minCount > 1) { success = pdiffs + 10; } //can't tell the difference between multiple primers - else{ //use the best match - group = minGroup; - if (!keepForward) { seq.setUnaligned(rawSequence.substr(minPos)); } - if(qual.getName() != ""){ - if (!keepForward) { qual.trimQScores(minPos, -1); } - } - success = minDiff; - } - - if (alignment != NULL) { delete alignment; } - - } - - return success; - - } - catch(exception& e) { - m->errorOut(e, "TrimOligos", "stripForward"); - exit(1); - } + try { + + if (paired) { int success = stripPairedPrimers(seq, qual, group, keepForward); return success; } + + string rawSequence = seq.getUnaligned(); + int success = pdiffs + 1; //guilty until proven innocent + + //can you find the primer + for(map::iterator it=primers.begin();it!=primers.end();it++){ + string oligo = it->first; + if(rawSequence.length() < oligo.length()){ //let's just assume that the primers are the same length + success = pdiffs + 10; //if the sequence is shorter than the barcode then bail out + break; + } + + if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){ + group = it->second; + if (!keepForward) { seq.setUnaligned(rawSequence.substr(oligo.length())); } + if(qual.getName() != ""){ + if (!keepForward) { qual.trimQScores(oligo.length(), -1); } + } + success = 0; + break; + } + } + + //if you found the barcode or if you don't want to allow for diffs + if ((pdiffs == 0) || (success == 0)) { return success; } + + else { //try aligning and see if you can find it + Alignment* alignment; + if (primers.size() > 0) { alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxFPrimerLength+pdiffs+1)); } + else{ alignment = NULL; } + + //can you find the barcode + int minDiff = 1e6; + int minCount = 1; + int minGroup = -1; + int minPos = 0; + + for(map::iterator it=primers.begin();it!=primers.end();it++){ + string oligo = it->first; + // int length = oligo.length(); + + if(rawSequence.length() < maxFPrimerLength){ + success = pdiffs + 100; + break; + } + + //use needleman to align first barcode.length()+numdiffs of sequence to each barcode + alignment->alignPrimer(oligo, rawSequence.substr(0,oligo.length()+pdiffs)); + oligo = alignment->getSeqAAln(); + string temp = alignment->getSeqBAln(); + + int alnLength = oligo.length(); + + for(int i=oligo.length()-1;i>=0;i--){ + if(oligo[i] != '-'){ alnLength = i+1; break; } + } + oligo = oligo.substr(0,alnLength); + temp = temp.substr(0,alnLength); + + int numDiff = countDiffs(oligo, temp); + + if(numDiff < minDiff){ + minDiff = numDiff; + minCount = 1; + minGroup = it->second; + minPos = 0; + for(int i=0;i pdiffs) { success = minDiff; } //no good matches + else if(minCount > 1) { success = pdiffs + 10; } //can't tell the difference between multiple primers + else{ //use the best match + group = minGroup; + if (!keepForward) { seq.setUnaligned(rawSequence.substr(minPos)); } + if(qual.getName() != ""){ + if (!keepForward) { qual.trimQScores(minPos, -1); } + } + success = minDiff; + } + + if (alignment != NULL) { delete alignment; } + + } + + return success; + + } + catch(exception& e) { + m->errorOut(e, "TrimOligos", "stripForward"); + exit(1); + } } //******************************************************************/ bool TrimOligos::stripReverse(Sequence& seq, QualityScores& qual){ - try { - string rawSequence = seq.getUnaligned(); - bool success = 0; //guilty until proven innocent - - for(int i=0;ierrorOut(e, "TrimOligos", "stripReverse"); - exit(1); - } + try { + string rawSequence = seq.getUnaligned(); + bool success = 0; //guilty until proven innocent + + for(int i=0;ierrorOut(e, "TrimOligos", "stripReverse"); + exit(1); + } } //******************************************************************/ bool TrimOligos::stripReverse(Sequence& seq){ - try { - - string rawSequence = seq.getUnaligned(); - bool success = 0; //guilty until proven innocent - - for(int i=0;ierrorOut(e, "TrimOligos", "stripReverse"); - exit(1); - } + try { + + string rawSequence = seq.getUnaligned(); + bool success = 0; //guilty until proven innocent + + for(int i=0;ierrorOut(e, "TrimOligos", "stripReverse"); + exit(1); + } } //******************************************************************/ bool TrimOligos::stripLinker(Sequence& seq, QualityScores& qual){ - try { - string rawSequence = seq.getUnaligned(); - bool success = ldiffs + 1; //guilty until proven innocent - - for(int i=0;i 0) { alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLinkerLength+ldiffs+1)); } - else{ alignment = NULL; } - - //can you find the barcode - int minDiff = 1e6; - int minCount = 1; - int minPos = 0; - - for(int i = 0; i < linker.size(); i++){ - string oligo = linker[i]; - // int length = oligo.length(); - - if(rawSequence.length() < maxLinkerLength){ //let's just assume that the barcodes are the same length - success = ldiffs + 10; - break; - } - - //use needleman to align first barcode.length()+numdiffs of sequence to each barcode - alignment->alignPrimer(oligo, rawSequence.substr(0,oligo.length()+ldiffs)); - oligo = alignment->getSeqAAln(); - string temp = alignment->getSeqBAln(); - - int alnLength = oligo.length(); - - for(int i=oligo.length()-1;i>=0;i--){ - if(oligo[i] != '-'){ alnLength = i+1; break; } - } - oligo = oligo.substr(0,alnLength); - temp = temp.substr(0,alnLength); - - int numDiff = countDiffs(oligo, temp); - - if(numDiff < minDiff){ - minDiff = numDiff; - minCount = 1; - minPos = 0; - for(int i=0;i ldiffs) { success = minDiff; } //no good matches - else if(minCount > 1) { success = ldiffs + 100; } //can't tell the difference between multiple barcodes - else{ //use the best match - seq.setUnaligned(rawSequence.substr(minPos)); - - if(qual.getName() != ""){ - qual.trimQScores(minPos, -1); - } - success = minDiff; - } - - if (alignment != NULL) { delete alignment; } - - } - - - return success; - - } - catch(exception& e) { - m->errorOut(e, "TrimOligos", "stripLinker"); - exit(1); - } + if ((ldiffs == 0) || (success == 0)) { return success; } + + else { //try aligning and see if you can find it + Alignment* alignment; + if (linker.size() > 0) { alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLinkerLength+ldiffs+1)); } + else{ alignment = NULL; } + + //can you find the barcode + int minDiff = 1e6; + int minCount = 1; + int minPos = 0; + + for(int i = 0; i < linker.size(); i++){ + string oligo = linker[i]; + // int length = oligo.length(); + + if(rawSequence.length() < maxLinkerLength){ //let's just assume that the barcodes are the same length + success = ldiffs + 10; + break; + } + + //use needleman to align first barcode.length()+numdiffs of sequence to each barcode + alignment->alignPrimer(oligo, rawSequence.substr(0,oligo.length()+ldiffs)); + oligo = alignment->getSeqAAln(); + string temp = alignment->getSeqBAln(); + + int alnLength = oligo.length(); + + for(int i=oligo.length()-1;i>=0;i--){ + if(oligo[i] != '-'){ alnLength = i+1; break; } + } + oligo = oligo.substr(0,alnLength); + temp = temp.substr(0,alnLength); + + int numDiff = countDiffs(oligo, temp); + + if(numDiff < minDiff){ + minDiff = numDiff; + minCount = 1; + minPos = 0; + for(int i=0;i ldiffs) { success = minDiff; } //no good matches + else if(minCount > 1) { success = ldiffs + 100; } //can't tell the difference between multiple barcodes + else{ //use the best match + seq.setUnaligned(rawSequence.substr(minPos)); + + if(qual.getName() != ""){ + qual.trimQScores(minPos, -1); + } + success = minDiff; + } + + if (alignment != NULL) { delete alignment; } + + } + + + return success; + + } + catch(exception& e) { + m->errorOut(e, "TrimOligos", "stripLinker"); + exit(1); + } } //******************************************************************/ bool TrimOligos::stripLinker(Sequence& seq){ - try { - - string rawSequence = seq.getUnaligned(); - bool success = ldiffs +1; //guilty until proven innocent - - for(int i=0;i 0) {alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLinkerLength+ldiffs+1)); } - else{ alignment = NULL; } - - //can you find the barcode - int minDiff = 1e6; - int minCount = 1; - int minPos = 0; - - for(int i = 0; i < linker.size(); i++){ - string oligo = linker[i]; - // int length = oligo.length(); - - if(rawSequence.length() < maxLinkerLength){ //let's just assume that the barcodes are the same length - success = ldiffs + 10; - break; - } - - //use needleman to align first barcode.length()+numdiffs of sequence to each barcode - alignment->alignPrimer(oligo, rawSequence.substr(0,oligo.length()+ldiffs)); - oligo = alignment->getSeqAAln(); - string temp = alignment->getSeqBAln(); - - int alnLength = oligo.length(); - - for(int i=oligo.length()-1;i>=0;i--){ - if(oligo[i] != '-'){ alnLength = i+1; break; } - } - oligo = oligo.substr(0,alnLength); - temp = temp.substr(0,alnLength); - - int numDiff = countDiffs(oligo, temp); - - if(numDiff < minDiff){ - minDiff = numDiff; - minCount = 1; - minPos = 0; - for(int i=0;i ldiffs) { success = minDiff; } //no good matches - else if(minCount > 1) { success = ldiffs + 100; } //can't tell the difference between multiple barcodes - else{ //use the best match - seq.setUnaligned(rawSequence.substr(minPos)); - success = minDiff; - } - - if (alignment != NULL) { delete alignment; } - - } - - return success; - - } - catch(exception& e) { - m->errorOut(e, "TrimOligos", "stripLinker"); - exit(1); - } + if ((ldiffs == 0) || (success == 0)) { return success; } + + else { //try aligning and see if you can find it + Alignment* alignment; + if (linker.size() > 0) {alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLinkerLength+ldiffs+1)); } + else{ alignment = NULL; } + + //can you find the barcode + int minDiff = 1e6; + int minCount = 1; + int minPos = 0; + + for(int i = 0; i < linker.size(); i++){ + string oligo = linker[i]; + // int length = oligo.length(); + + if(rawSequence.length() < maxLinkerLength){ //let's just assume that the barcodes are the same length + success = ldiffs + 10; + break; + } + + //use needleman to align first barcode.length()+numdiffs of sequence to each barcode + alignment->alignPrimer(oligo, rawSequence.substr(0,oligo.length()+ldiffs)); + oligo = alignment->getSeqAAln(); + string temp = alignment->getSeqBAln(); + + int alnLength = oligo.length(); + + for(int i=oligo.length()-1;i>=0;i--){ + if(oligo[i] != '-'){ alnLength = i+1; break; } + } + oligo = oligo.substr(0,alnLength); + temp = temp.substr(0,alnLength); + + int numDiff = countDiffs(oligo, temp); + + if(numDiff < minDiff){ + minDiff = numDiff; + minCount = 1; + minPos = 0; + for(int i=0;i ldiffs) { success = minDiff; } //no good matches + else if(minCount > 1) { success = ldiffs + 100; } //can't tell the difference between multiple barcodes + else{ //use the best match + seq.setUnaligned(rawSequence.substr(minPos)); + success = minDiff; + } + + if (alignment != NULL) { delete alignment; } + + } + + return success; + + } + catch(exception& e) { + m->errorOut(e, "TrimOligos", "stripLinker"); + exit(1); + } } //******************************************************************/ bool TrimOligos::stripSpacer(Sequence& seq, QualityScores& qual){ - try { - string rawSequence = seq.getUnaligned(); - bool success = sdiffs+1; //guilty until proven innocent - - for(int i=0;i 0) { alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxSpacerLength+sdiffs+1)); } - else{ alignment = NULL; } - - //can you find the barcode - int minDiff = 1e6; - int minCount = 1; - int minPos = 0; - - for(int i = 0; i < spacer.size(); i++){ - string oligo = spacer[i]; - // int length = oligo.length(); - - if(rawSequence.length() < maxSpacerLength){ //let's just assume that the barcodes are the same length - success = sdiffs + 10; - break; - } - - //use needleman to align first barcode.length()+numdiffs of sequence to each barcode - alignment->alignPrimer(oligo, rawSequence.substr(0,oligo.length()+sdiffs)); - oligo = alignment->getSeqAAln(); - string temp = alignment->getSeqBAln(); - - int alnLength = oligo.length(); - - for(int i=oligo.length()-1;i>=0;i--){ - if(oligo[i] != '-'){ alnLength = i+1; break; } - } - oligo = oligo.substr(0,alnLength); - temp = temp.substr(0,alnLength); - - int numDiff = countDiffs(oligo, temp); - - if(numDiff < minDiff){ - minDiff = numDiff; - minCount = 1; - minPos = 0; - for(int i=0;i sdiffs) { success = minDiff; } //no good matches - else if(minCount > 1) { success = sdiffs + 100; } //can't tell the difference between multiple barcodes - else{ //use the best match - seq.setUnaligned(rawSequence.substr(minPos)); - - if(qual.getName() != ""){ - qual.trimQScores(minPos, -1); - } - success = minDiff; - } - - if (alignment != NULL) { delete alignment; } - - } + if ((sdiffs == 0) || (success == 0)) { return success; } - - return success; - - } - catch(exception& e) { - m->errorOut(e, "TrimOligos", "stripSpacer"); - exit(1); - } + else { //try aligning and see if you can find it + Alignment* alignment; + if (spacer.size() > 0) { alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxSpacerLength+sdiffs+1)); } + else{ alignment = NULL; } + + //can you find the barcode + int minDiff = 1e6; + int minCount = 1; + int minPos = 0; + + for(int i = 0; i < spacer.size(); i++){ + string oligo = spacer[i]; + // int length = oligo.length(); + + if(rawSequence.length() < maxSpacerLength){ //let's just assume that the barcodes are the same length + success = sdiffs + 10; + break; + } + + //use needleman to align first barcode.length()+numdiffs of sequence to each barcode + alignment->alignPrimer(oligo, rawSequence.substr(0,oligo.length()+sdiffs)); + oligo = alignment->getSeqAAln(); + string temp = alignment->getSeqBAln(); + + int alnLength = oligo.length(); + + for(int i=oligo.length()-1;i>=0;i--){ + if(oligo[i] != '-'){ alnLength = i+1; break; } + } + oligo = oligo.substr(0,alnLength); + temp = temp.substr(0,alnLength); + + int numDiff = countDiffs(oligo, temp); + + if(numDiff < minDiff){ + minDiff = numDiff; + minCount = 1; + minPos = 0; + for(int i=0;i sdiffs) { success = minDiff; } //no good matches + else if(minCount > 1) { success = sdiffs + 100; } //can't tell the difference between multiple barcodes + else{ //use the best match + seq.setUnaligned(rawSequence.substr(minPos)); + + if(qual.getName() != ""){ + qual.trimQScores(minPos, -1); + } + success = minDiff; + } + + if (alignment != NULL) { delete alignment; } + + } + + + return success; + + } + catch(exception& e) { + m->errorOut(e, "TrimOligos", "stripSpacer"); + exit(1); + } } //******************************************************************/ bool TrimOligos::stripSpacer(Sequence& seq){ - try { - - string rawSequence = seq.getUnaligned(); - bool success = sdiffs+1; //guilty until proven innocent - - for(int i=0;i 0) { alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxSpacerLength+sdiffs+1)); } - else{ alignment = NULL; } - - //can you find the barcode - int minDiff = 1e6; - int minCount = 1; - int minPos = 0; - - for(int i = 0; i < spacer.size(); i++){ - string oligo = spacer[i]; - // int length = oligo.length(); - - if(rawSequence.length() < maxSpacerLength){ //let's just assume that the barcodes are the same length - success = sdiffs + 10; - break; - } - - //use needleman to align first barcode.length()+numdiffs of sequence to each barcode - alignment->alignPrimer(oligo, rawSequence.substr(0,oligo.length()+sdiffs)); - oligo = alignment->getSeqAAln(); - string temp = alignment->getSeqBAln(); - - int alnLength = oligo.length(); - - for(int i=oligo.length()-1;i>=0;i--){ - if(oligo[i] != '-'){ alnLength = i+1; break; } - } - oligo = oligo.substr(0,alnLength); - temp = temp.substr(0,alnLength); - - int numDiff = countDiffs(oligo, temp); - - if(numDiff < minDiff){ - minDiff = numDiff; - minCount = 1; - minPos = 0; - for(int i=0;i sdiffs) { success = minDiff; } //no good matches - else if(minCount > 1) { success = sdiffs + 100; } //can't tell the difference between multiple barcodes - else{ //use the best match - seq.setUnaligned(rawSequence.substr(minPos)); - success = minDiff; - } - - if (alignment != NULL) { delete alignment; } - - } - - return success; - - } - catch(exception& e) { - m->errorOut(e, "TrimOligos", "stripSpacer"); - exit(1); - } + if ((sdiffs == 0) || (success == 0)) { return success; } + + else { //try aligning and see if you can find it + Alignment* alignment; + if (spacer.size() > 0) { alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxSpacerLength+sdiffs+1)); } + else{ alignment = NULL; } + + //can you find the barcode + int minDiff = 1e6; + int minCount = 1; + int minPos = 0; + + for(int i = 0; i < spacer.size(); i++){ + string oligo = spacer[i]; + // int length = oligo.length(); + + if(rawSequence.length() < maxSpacerLength){ //let's just assume that the barcodes are the same length + success = sdiffs + 10; + break; + } + + //use needleman to align first barcode.length()+numdiffs of sequence to each barcode + alignment->alignPrimer(oligo, rawSequence.substr(0,oligo.length()+sdiffs)); + oligo = alignment->getSeqAAln(); + string temp = alignment->getSeqBAln(); + + int alnLength = oligo.length(); + + for(int i=oligo.length()-1;i>=0;i--){ + if(oligo[i] != '-'){ alnLength = i+1; break; } + } + oligo = oligo.substr(0,alnLength); + temp = temp.substr(0,alnLength); + + int numDiff = countDiffs(oligo, temp); + + if(numDiff < minDiff){ + minDiff = numDiff; + minCount = 1; + minPos = 0; + for(int i=0;i sdiffs) { success = minDiff; } //no good matches + else if(minCount > 1) { success = sdiffs + 100; } //can't tell the difference between multiple barcodes + else{ //use the best match + seq.setUnaligned(rawSequence.substr(minPos)); + success = minDiff; + } + + if (alignment != NULL) { delete alignment; } + + } + + return success; + + } + catch(exception& e) { + m->errorOut(e, "TrimOligos", "stripSpacer"); + exit(1); + } } //******************************************************************/ bool TrimOligos::compareDNASeq(string oligo, string seq){ - try { - bool success = 1; - int length = oligo.length(); - - for(int i=0;ierrorOut(e, "TrimOligos", "compareDNASeq"); - exit(1); - } - + try { + bool success = 1; + int length = oligo.length(); + + for(int i=0;ierrorOut(e, "TrimOligos", "compareDNASeq"); + exit(1); + } + } //********************************************************************/ int TrimOligos::countDiffs(string oligo, string seq){ - try { - - int length = oligo.length(); - int countDiffs = 0; - - for(int i=0;ierrorOut(e, "TrimOligos", "countDiffs"); - exit(1); - } + try { + + int length = oligo.length(); + int countDiffs = 0; + + for(int i=0;ierrorOut(e, "TrimOligos", "countDiffs"); + exit(1); + } } //********************************************************************/ string TrimOligos::reverseOligo(string oligo){ - try { + try { string reverse = ""; for(int i=oligo.length()-1;i>=0;i--){ - if(oligo[i] == 'A') { reverse += 'T'; } + if(oligo[i] == 'A') { reverse += 'T'; } else if(oligo[i] == 'T'){ reverse += 'A'; } else if(oligo[i] == 'U'){ reverse += 'A'; } @@ -2710,20 +2640,19 @@ string TrimOligos::reverseOligo(string oligo){ else if(oligo[i] == 'D'){ reverse += 'H'; } else if(oligo[i] == 'H'){ reverse += 'D'; } - else if(oligo[i] == '-'){ reverse += '-'; } - else { reverse += 'N'; } + else if(oligo[i] == '-'){ reverse += '-'; } + else { reverse += 'N'; } } return reverse; } - catch(exception& e) { - m->errorOut(e, "TrimOligos", "reverseOligo"); - exit(1); - } + catch(exception& e) { + m->errorOut(e, "TrimOligos", "reverseOligo"); + exit(1); + } } /********************************************************************/ - diff --git a/trimoligos.h b/trimoligos.h index f3e6954..a86eb9a 100644 --- a/trimoligos.h +++ b/trimoligos.h @@ -56,13 +56,10 @@ class TrimOligos { bool findReverse(Sequence&, int&, int&); string reverseOligo(string); - string getTrashCode() { return trashCode; } - private: int pdiffs, bdiffs, ldiffs, sdiffs; bool paired; - string trashCode; map barcodes; map primers; diff --git a/trimseqscommand.cpp b/trimseqscommand.cpp index 935b9a9..7140b56 100644 --- a/trimseqscommand.cpp +++ b/trimseqscommand.cpp @@ -443,7 +443,9 @@ int TrimSeqsCommand::execute(){ outputNames.push_back(outputGroupFileName); outputTypes["group"].push_back(outputGroupFileName); } } - + + if (!pairedOligos) { if (reorient) { m->mothurOut("[WARNING]: You cannot use reorient without paired barcodes or primers, skipping."); m->mothurOutEndLine(); reorient = false; } } + if (m->control_pressed) { return 0; } //fills lines and qlines @@ -738,7 +740,9 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string if(numBarcodes != 0){ success = trimOligos->stripBarcode(currSeq, currQual, barcodeIndex); - if(success > bdiffs) { trashCode += 'b'; } + if(success > bdiffs) { + trashCode += 'b'; + } else{ currentSeqsDiffs += success; } } @@ -752,9 +756,7 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string if(numFPrimers != 0){ success = trimOligos->stripForward(currSeq, currQual, primerIndex, keepforward); if(success > pdiffs) { - //if (pairedOligos) { trashCode += trimOligos->getTrashCode(); } - //else { trashCode += 'f'; } - trashCode += 'f'; + trashCode += 'f'; } else{ currentSeqsDiffs += success; } } @@ -776,17 +778,13 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string if(numBarcodes != 0){ thisSuccess = rtrimOligos->stripBarcode(savedSeq, savedQual, thisBarcodeIndex); - if(thisSuccess > bdiffs) { thisTrashCode += 'b'; } + if(thisSuccess > bdiffs) { thisTrashCode += "b"; } else{ thisCurrentSeqsDiffs += thisSuccess; } } if(numFPrimers != 0){ thisSuccess = rtrimOligos->stripForward(savedSeq, savedQual, thisPrimerIndex, keepforward); - if(thisSuccess > pdiffs) { - //if (pairedOligos) { thisTrashCode += rtrimOligos->getTrashCode(); } - //else { thisTrashCode += 'f'; } - thisTrashCode += 'f'; - } + if(thisSuccess > pdiffs) { thisTrashCode += "f"; } else{ thisCurrentSeqsDiffs += thisSuccess; } } @@ -804,7 +802,7 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string savedQual.flipQScores(); currQual.setScores(savedQual.getScores()); } - } + }else { trashCode += "(" + thisTrashCode + ")"; } } if(keepFirst != 0){ @@ -1653,7 +1651,7 @@ bool TrimSeqsCommand::getOligos(vector >& fastaFileNames, vector< } if(barcodeNameVector.size() == 0 && primerNameVector[0] == ""){ allFiles = 0; } - + //add in potential combos if(barcodeNameVector.size() == 0){ barcodes[""] = 0; diff --git a/trimseqscommand.h b/trimseqscommand.h index de3d839..53f04d3 100644 --- a/trimseqscommand.h +++ b/trimseqscommand.h @@ -381,7 +381,7 @@ static DWORD WINAPI MyTrimThreadFunction(LPVOID lpParam){ savedQual.flipQScores(); currQual.setScores(savedQual.getScores()); } - } + }else { trashCode += "(" + thisTrashCode + ")"; } }