X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=trimseqscommand.cpp;h=20002049ce986b01f5395ed2b807471cf954133f;hb=bf83aa09a52403a2faf1a1f82489d1e0c8c32283;hp=1989365a5a2a653d54c6b4258f65deaceac079c6;hpb=02909d6cae9963ba00dc746969a370fa8ca934fc;p=mothur.git diff --git a/trimseqscommand.cpp b/trimseqscommand.cpp index 1989365..2000204 100644 --- a/trimseqscommand.cpp +++ b/trimseqscommand.cpp @@ -8,20 +8,86 @@ */ #include "trimseqscommand.h" +#include "needlemanoverlap.hpp" + +//********************************************************************************************************************** + +vector TrimSeqsCommand::getValidParameters(){ + try { + string Array[] = {"fasta", "flip", "oligos", "maxambig", "maxhomop", "group","minlength", "maxlength", "qfile", + "qthreshold", "qwindowaverage", "qstepsize", "qwindowsize", "qaverage", "rollaverage", + "keepfirst", "removelast", + "allfiles", "qtrim","tdiffs", "pdiffs", "bdiffs", "processors", "outputdir","inputdir"}; + vector myArray (Array, Array+(sizeof(Array)/sizeof(string))); + return myArray; + } + catch(exception& e) { + m->errorOut(e, "TrimSeqsCommand", "getValidParameters"); + exit(1); + } +} + +//********************************************************************************************************************** + +TrimSeqsCommand::TrimSeqsCommand(){ + try { + abort = true; + //initialize outputTypes + vector tempOutNames; + outputTypes["fasta"] = tempOutNames; + outputTypes["qual"] = tempOutNames; + outputTypes["group"] = tempOutNames; + } + catch(exception& e) { + m->errorOut(e, "TrimSeqsCommand", "TrimSeqsCommand"); + exit(1); + } +} + +//********************************************************************************************************************** + +vector TrimSeqsCommand::getRequiredParameters(){ + try { + string Array[] = {"fasta"}; + vector myArray (Array, Array+(sizeof(Array)/sizeof(string))); + return myArray; + } + catch(exception& e) { + m->errorOut(e, "TrimSeqsCommand", "getRequiredParameters"); + exit(1); + } +} + +//********************************************************************************************************************** + +vector TrimSeqsCommand::getRequiredFiles(){ + try { + vector myArray; + return myArray; + } + catch(exception& e) { + m->errorOut(e, "TrimSeqsCommand", "getRequiredFiles"); + exit(1); + } +} //*************************************************************************************************************** -TrimSeqsCommand::TrimSeqsCommand(string option){ +TrimSeqsCommand::TrimSeqsCommand(string option) { try { abort = false; + comboStarts = 0; //allow user to run help if(option == "help") { help(); abort = true; } else { //valid paramters for this command - string AlignArray[] = {"fasta", "flip", "oligos", "maxambig", "maxhomop", "minlength", "maxlength", "qfile", "qthreshold", "qaverage", "allfiles"}; + string AlignArray[] = { "fasta", "flip", "oligos", "maxambig", "maxhomop", "group","minlength", "maxlength", "qfile", + "qthreshold", "qwindowaverage", "qstepsize", "qwindowsize", "qaverage", "rollaverage", + "keepfirst", "removelast", + "allfiles", "qtrim","tdiffs", "pdiffs", "bdiffs", "processors", "outputdir","inputdir"}; vector myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string))); @@ -29,30 +95,87 @@ TrimSeqsCommand::TrimSeqsCommand(string option){ map parameters = parser.getParameters(); ValidParameters validParameter; + map::iterator it; //check to make sure all parameters are valid for command - for (map::iterator it = parameters.begin(); it != parameters.end(); it++) { + for (it = parameters.begin(); it != parameters.end(); it++) { if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; } } + //initialize outputTypes + vector tempOutNames; + outputTypes["fasta"] = tempOutNames; + outputTypes["qual"] = tempOutNames; + outputTypes["group"] = tempOutNames; + + //if the user changes the input directory command factory will send this info to us in the output parameter + string inputDir = validParameter.validFile(parameters, "inputdir", false); + if (inputDir == "not found"){ inputDir = ""; } + else { + string path; + it = parameters.find("fasta"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["fasta"] = inputDir + it->second; } + } + + it = parameters.find("oligos"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["oligos"] = inputDir + it->second; } + } + + it = parameters.find("qfile"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["qfile"] = inputDir + it->second; } + } + + it = parameters.find("group"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["group"] = inputDir + it->second; } + } + } + + //check for required parameters fastaFile = validParameter.validFile(parameters, "fasta", true); - if (fastaFile == "not found") { cout << "fasta is a required parameter for the screen.seqs command." << endl; abort = true; } + if (fastaFile == "not found") { m->mothurOut("fasta is a required parameter for the trim.seqs command."); m->mothurOutEndLine(); abort = true; } else if (fastaFile == "not open") { abort = true; } + + //if the user changes the output directory command factory will send this info to us in the output parameter + outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ + outputDir = ""; + outputDir += m->hasPath(fastaFile); //if user entered a file with a path then preserve it + } - + //check for optional parameter and set defaults // ...at some point should added some additional type checking... string temp; temp = validParameter.validFile(parameters, "flip", false); if (temp == "not found"){ flip = 0; } - else if(isTrue(temp)) { flip = 1; } + else if(m->isTrue(temp)) { flip = 1; } temp = validParameter.validFile(parameters, "oligos", true); if (temp == "not found"){ oligoFile = ""; } else if(temp == "not open"){ abort = true; } else { oligoFile = temp; } + temp = validParameter.validFile(parameters, "group", true); + if (temp == "not found"){ groupfile = ""; } + else if(temp == "not open"){ abort = true; } + else { groupfile = temp; } + temp = validParameter.validFile(parameters, "maxambig", false); if (temp == "not found") { temp = "-1"; } convert(temp, maxAmbig); @@ -65,72 +188,115 @@ TrimSeqsCommand::TrimSeqsCommand(string option){ temp = validParameter.validFile(parameters, "maxlength", false); if (temp == "not found") { temp = "0"; } convert(temp, maxLength); + temp = validParameter.validFile(parameters, "bdiffs", false); if (temp == "not found") { temp = "0"; } + convert(temp, bdiffs); + + temp = validParameter.validFile(parameters, "pdiffs", false); if (temp == "not found") { temp = "0"; } + convert(temp, pdiffs); + + temp = validParameter.validFile(parameters, "tdiffs", false); if (temp == "not found") { int tempTotal = pdiffs + bdiffs; temp = toString(tempTotal); } + convert(temp, tdiffs); + + if(tdiffs == 0){ tdiffs = bdiffs + pdiffs; } + temp = validParameter.validFile(parameters, "qfile", true); if (temp == "not found") { qFileName = ""; } - else if(temp == "not open") { abort = 0; } + else if(temp == "not open") { abort = true; } else { qFileName = temp; } temp = validParameter.validFile(parameters, "qthreshold", false); if (temp == "not found") { temp = "0"; } convert(temp, qThreshold); + + temp = validParameter.validFile(parameters, "qtrim", false); if (temp == "not found") { temp = "F"; } + qtrim = m->isTrue(temp); + + temp = validParameter.validFile(parameters, "rollaverage", false); if (temp == "not found") { temp = "0"; } + convert(temp, qRollAverage); + + temp = validParameter.validFile(parameters, "qwindowaverage", false);if (temp == "not found") { temp = "0"; } + convert(temp, qWindowAverage); + + temp = validParameter.validFile(parameters, "qwindowsize", false); if (temp == "not found") { temp = "50"; } + convert(temp, qWindowSize); + + temp = validParameter.validFile(parameters, "qstepsize", false); if (temp == "not found") { temp = "1"; } + convert(temp, qWindowStep); temp = validParameter.validFile(parameters, "qaverage", false); if (temp == "not found") { temp = "0"; } convert(temp, qAverage); + + temp = validParameter.validFile(parameters, "keepfirst", false); if (temp == "not found") { temp = "0"; } + convert(temp, keepFirst); + + temp = validParameter.validFile(parameters, "removelast", false); if (temp == "not found") { temp = "0"; } + convert(temp, removeLast); temp = validParameter.validFile(parameters, "allfiles", false); if (temp == "not found") { temp = "F"; } - allFiles = isTrue(temp); + allFiles = m->isTrue(temp); - if(allFiles && oligoFile == ""){ - cout << "You selected allfiles, but didn't enter an oligos file. Ignoring the allfiles request." << endl; + temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found") { temp = "1"; } + convert(temp, processors); + + if ((oligoFile != "") && (groupfile != "")) { + m->mothurOut("You given both a oligos file and a groupfile, only one is allowed."); m->mothurOutEndLine(); abort = true; + } + + + if(allFiles && (oligoFile == "") && (groupfile == "")){ + m->mothurOut("You selected allfiles, but didn't enter an oligos or group file. Ignoring the allfiles request."); m->mothurOutEndLine(); } if((qAverage != 0 && qThreshold != 0) && qFileName == ""){ - cout << "You didn't provide a quality file name, quality criteria will be ignored." << endl; + m->mothurOut("You didn't provide a quality file name, quality criteria will be ignored."); m->mothurOutEndLine(); qAverage=0; qThreshold=0; } if(!flip && oligoFile=="" && !maxLength && !minLength && (maxAmbig==-1) && !maxHomoP && qFileName == ""){ - cout << "You didn't set any options... quiting command." << endl; + m->mothurOut("You didn't set any options... quiting command."); m->mothurOutEndLine(); abort = true; } } } catch(exception& e) { - cout << "Standard Error: " << e.what() << " has occurred in the TrimSeqsCommand class Function TrimSeqsCommand. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n"; + m->errorOut(e, "TrimSeqsCommand", "TrimSeqsCommand"); exit(1); } - catch(...) { - cout << "An unknown error has occurred in the TrimSeqsCommand class function TrimSeqsCommand. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n"; - exit(1); - } } + //********************************************************************************************************************** void TrimSeqsCommand::help(){ try { - cout << "The trim.seqs command reads a fastaFile and creates ....." << "\n"; - cout << "The trim.seqs command parameters are fasta, flip, oligos, maxambig, maxhomop, minlength and maxlength." << "\n"; - cout << "The fasta parameter is required." << "\n"; - cout << "The flip parameter .... The default is 0." << "\n"; - cout << "The oligos parameter .... The default is ""." << "\n"; - cout << "The maxambig parameter .... The default is -1." << "\n"; - cout << "The maxhomop parameter .... The default is 0." << "\n"; - cout << "The minlength parameter .... The default is 0." << "\n"; - cout << "The maxlength parameter .... The default is 0." << "\n"; - cout << "The trim.seqs command should be in the following format: " << "\n"; - cout << "trim.seqs(fasta=yourFastaFile, flip=yourFlip, oligos=yourOligos, maxambig=yourMaxambig, " << "\n"; - cout << "maxhomop=yourMaxhomop, minlength=youMinlength, maxlength=yourMaxlength) " << "\n"; - cout << "Example trim.seqs(fasta=abrecovery.fasta, flip=..., oligos=..., maxambig=..., maxhomop=..., minlength=..., maxlength=...)." << "\n"; - cout << "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta)." << "\n" << "\n"; + m->mothurOut("The trim.seqs command reads a fastaFile and creates .....\n"); + m->mothurOut("The trim.seqs command parameters are fasta, flip, oligos, group, maxambig, maxhomop, minlength, maxlength, qfile, qthreshold, qaverage, diffs, qtrim and allfiles.\n"); + m->mothurOut("The fasta parameter is required.\n"); + m->mothurOut("The group parameter allows you to enter a group file for your fasta file.\n"); + m->mothurOut("The flip parameter will output the reverse compliment of your trimmed sequence. The default is false.\n"); + m->mothurOut("The oligos parameter .... The default is "".\n"); + m->mothurOut("The maxambig parameter .... The default is -1.\n"); + m->mothurOut("The maxhomop parameter .... The default is 0.\n"); + m->mothurOut("The minlength parameter .... The default is 0.\n"); + m->mothurOut("The maxlength parameter .... The default is 0.\n"); + m->mothurOut("The tdiffs parameter is used to specify the total number of differences allowed in the sequence. The default is pdiffs + bdiffs.\n"); + m->mothurOut("The bdiffs parameter is used to specify the number of differences allowed in the barcode. The default is 0.\n"); + m->mothurOut("The pdiffs parameter is used to specify the number of differences allowed in the primer. The default is 0.\n"); + m->mothurOut("The qfile parameter .....\n"); + m->mothurOut("The qthreshold parameter .... The default is 0.\n"); + m->mothurOut("The qaverage parameter .... The default is 0.\n"); + m->mothurOut("The allfiles parameter .... The default is F.\n"); + m->mothurOut("The qtrim parameter .... The default is F.\n"); + m->mothurOut("The trim.seqs command should be in the following format: \n"); + m->mothurOut("trim.seqs(fasta=yourFastaFile, flip=yourFlip, oligos=yourOligos, maxambig=yourMaxambig, \n"); + m->mothurOut("maxhomop=yourMaxhomop, minlength=youMinlength, maxlength=yourMaxlength) \n"); + m->mothurOut("Example trim.seqs(fasta=abrecovery.fasta, flip=..., oligos=..., maxambig=..., maxhomop=..., minlength=..., maxlength=...).\n"); + m->mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n"); + m->mothurOut("For more details please check out the wiki http://www.mothur.org/wiki/Trim.seqs .\n\n"); } catch(exception& e) { - cout << "Standard Error: " << e.what() << " has occurred in the TrimSeqsCommand class Function help. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n"; + m->errorOut(e, "TrimSeqsCommand", "help"); exit(1); } - catch(...) { - cout << "An unknown error has occurred in the TrimSeqsCommand class function help. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n"; - exit(1); - } } @@ -144,389 +310,1174 @@ int TrimSeqsCommand::execute(){ try{ if (abort == true) { return 0; } + + numFPrimers = 0; //this needs to be initialized + numRPrimers = 0; + vector fastaFileNames; + vector qualFileNames; + + string trimSeqFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "trim.fasta"; + outputNames.push_back(trimSeqFile); outputTypes["fasta"].push_back(trimSeqFile); + string scrapSeqFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "scrap.fasta"; + outputNames.push_back(scrapSeqFile); outputTypes["fasta"].push_back(scrapSeqFile); + string trimQualFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "trim.qual"; + string scrapQualFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "scrap.qual"; + if (qFileName != "") { outputNames.push_back(trimQualFile); outputNames.push_back(scrapQualFile); outputTypes["qual"].push_back(trimQualFile); outputTypes["qual"].push_back(scrapQualFile); } + string groupFile = ""; + if (groupfile == "") { groupFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "groups"; } + else{ + groupFile = outputDir + m->getRootName(m->getSimpleName(groupfile)) + "trim.groups"; + outputNames.push_back(groupFile); outputTypes["group"].push_back(groupFile); + groupMap = new GroupMap(groupfile); + groupMap->readMap(); + + if(allFiles){ + for (int i = 0; i < groupMap->namesOfGroups.size(); i++) { + groupToIndex[groupMap->namesOfGroups[i]] = i; + groupVector.push_back(groupMap->namesOfGroups[i]); + fastaFileNames.push_back((outputDir + m->getRootName(m->getSimpleName(fastaFile)) + groupMap->namesOfGroups[i] + ".fasta")); + + //we append later, so we want to clear file + ofstream outRemove; + m->openOutputFile(fastaFileNames[i], outRemove); + outRemove.close(); + if(qFileName != ""){ + qualFileNames.push_back((outputDir + m->getRootName(m->getSimpleName(qFileName)) + groupMap->namesOfGroups[i] + ".qual")); + ofstream outRemove2; + m->openOutputFile(qualFileNames[i], outRemove2); + outRemove2.close(); + } + } + } + comboStarts = fastaFileNames.size()-1; + } + + if(oligoFile != ""){ + outputNames.push_back(groupFile); outputTypes["group"].push_back(groupFile); + getOligos(fastaFileNames, qualFileNames); + } - ifstream inFASTA; - openInputFile(fastaFile, inFASTA); + vector fastaFilePos; + vector qFilePos; + + setLines(fastaFile, qFileName, fastaFilePos, qFilePos); + + for (int i = 0; i < (fastaFilePos.size()-1); i++) { + lines.push_back(new linePair(fastaFilePos[i], fastaFilePos[(i+1)])); + if (qFileName != "") { qLines.push_back(new linePair(qFilePos[i], qFilePos[(i+1)])); } + } + if(qFileName == "") { qLines = lines; } //files with duds + + #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) + if(processors == 1){ + driverCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, groupFile, fastaFileNames, qualFileNames, lines[0], qLines[0]); + }else{ + createProcessesCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, groupFile, fastaFileNames, qualFileNames); + } + #else + driverCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, groupFile, fastaFileNames, qualFileNames, lines[0], qLines[0]); + #endif + + if (m->control_pressed) { return 0; } + + set blanks; + for(int i=0;iisBlank(fastaFileNames[i])) { blanks.insert(fastaFileNames[i]); } + else if (filesToRemove.count(fastaFileNames[i]) > 0) { remove(fastaFileNames[i].c_str()); } + else { + ifstream inFASTA; + string seqName; + m->openInputFile(fastaFileNames[i], inFASTA); + ofstream outGroups; + string outGroupFilename = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[i])) + "groups"; + + //if the fastafile is on the blanks list then the groups file should be as well + if (blanks.count(fastaFileNames[i]) != 0) { blanks.insert(outGroupFilename); } + + m->openOutputFile(outGroupFilename, outGroups); + outputNames.push_back(outGroupFilename); outputTypes["group"].push_back(outGroupFilename); + + string thisGroup = ""; + if (i > comboStarts) { + map::iterator itCombo; + for(itCombo=combos.begin();itCombo!=combos.end(); itCombo++){ + if(itCombo->second == i){ thisGroup = itCombo->first; combos.erase(itCombo); break; } + } + }else{ thisGroup = groupVector[i]; } + + while(!inFASTA.eof()){ + if(inFASTA.get() == '>'){ + inFASTA >> seqName; + outGroups << seqName << '\t' << thisGroup << endl; + } + while (!inFASTA.eof()) { char c = inFASTA.get(); if (c == 10 || c == 13){ break; } } + } + outGroups.close(); + inFASTA.close(); + } + } + + for (set::iterator itBlanks = blanks.begin(); itBlanks != blanks.end(); itBlanks++) { remove((*(itBlanks)).c_str()); } + + blanks.clear(); + if(qFileName != ""){ + for(int i=0;iisBlank(qualFileNames[i])) { blanks.insert(qualFileNames[i]); } + else if (filesToRemove.count(qualFileNames[i]) > 0) { remove(qualFileNames[i].c_str()); } + } + } + + for (set::iterator itBlanks = blanks.begin(); itBlanks != blanks.end(); itBlanks++) { remove((*(itBlanks)).c_str()); } + + if (m->control_pressed) { + for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } + return 0; + } + + m->mothurOutEndLine(); + m->mothurOut("Output File Names: "); m->mothurOutEndLine(); + for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); } + m->mothurOutEndLine(); + + return 0; + + } + catch(exception& e) { + m->errorOut(e, "TrimSeqsCommand", "execute"); + exit(1); + } +} + +/**************************************************************************************/ + +int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string trimFile, string scrapFile, string trimQFile, string scrapQFile, string groupFile, vector fastaNames, vector qualNames, linePair* line, linePair* qline) { + + try { ofstream outFASTA; - string trimSeqFile = getRootName(fastaFile) + "trim.fasta"; - openOutputFile(trimSeqFile, outFASTA); + int able = m->openOutputFile(trimFile, outFASTA); + + ofstream scrapFASTA; + m->openOutputFile(scrapFile, scrapFASTA); + + ofstream outQual; + ofstream scrapQual; + if(qFileName != ""){ + m->openOutputFile(trimQFile, outQual); + m->openOutputFile(scrapQFile, scrapQual); + } ofstream outGroups; - vector fastaFileNames; - if(oligoFile != ""){ - string groupFile = getRootName(fastaFile) + "groups"; - openOutputFile(groupFile, outGroups); - getOligos(fastaFileNames); + + if (oligoFile != "") { + m->openOutputFile(groupFile, outGroups); } - ofstream scrapFASTA; - string scrapSeqFile = getRootName(fastaFile) + "scrap.fasta"; - openOutputFile(scrapSeqFile, scrapFASTA); + ifstream inFASTA; + m->openInputFile(filename, inFASTA); + inFASTA.seekg(line->start); ifstream qFile; - if(qFileName != "") { openInputFile(qFileName, qFile); } + if(qFileName != "") { m->openInputFile(qFileName, qFile); qFile.seekg(qline->start); } - bool success; - while(!inFASTA.eof()){ - Sequence currSeq(inFASTA); - string origSeq = currSeq.getUnaligned(); - int group; - string trashCode = ""; + for (int i = 0; i < fastaNames.size(); i++) { //clears old file + ofstream temp; + m->openOutputFile(fastaNames[i], temp); + temp.close(); + } + for (int i = 0; i < qualNames.size(); i++) { //clears old file + ofstream temp; + m->openOutputFile(qualNames[i], temp); + temp.close(); + } + - if(qFileName != ""){ - if(qThreshold != 0) { success = stripQualThreshold(currSeq, qFile); } - else if(qAverage != 0) { success = cullQualAverage(currSeq, qFile); } - if(!success) { trashCode += 'q'; } - qFile.close(); - } - if(barcodes.size() != 0){ - success = stripBarcode(currSeq, group); - if(!success){ trashCode += 'b'; } - } - if(numFPrimers != 0){ - success = stripForward(currSeq); - if(!success){ trashCode += 'f'; } - } - if(numRPrimers != 0){ - success = stripReverse(currSeq); - if(!success){ trashCode += 'r'; } - } - if(minLength > 0 || maxLength > 0){ - success = cullLength(currSeq); - if(!success){ trashCode += 'l'; } - } - if(maxHomoP > 0){ - success = cullHomoP(currSeq); - if(!success){ trashCode += 'h'; } - } - if(maxAmbig != -1){ - success = cullAmbigs(currSeq); - if(!success){ trashCode += 'n'; } + bool done = false; + int count = 0; + + while (!done) { + + if (m->control_pressed) { + inFASTA.close(); outFASTA.close(); scrapFASTA.close(); + if (oligoFile != "") { outGroups.close(); } + + if(qFileName != ""){ + qFile.close(); + } + for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } + + return 0; } - if(flip){ currSeq.reverseComplement(); } // should go last + int success = 1; + + + Sequence currSeq(inFASTA); m->gobble(inFASTA); + + QualityScores currQual; + if(qFileName != ""){ + currQual = QualityScores(qFile, currSeq.getNumBases()); m->gobble(qFile); + } - if(trashCode.length() == 0){ - currSeq.printSequence(outFASTA); + string origSeq = currSeq.getUnaligned(); + if (origSeq != "") { + int groupBar, groupPrime; + string trashCode = ""; + int currentSeqsDiffs = 0; + if(barcodes.size() != 0){ - outGroups << currSeq.getName() << '\t' << groupVector[group] << endl; + success = stripBarcode(currSeq, currQual, groupBar); + if(success > bdiffs) { trashCode += 'b'; } + else{ currentSeqsDiffs += success; } + } + + if(numFPrimers != 0){ + success = stripForward(currSeq, currQual, groupPrime); + if(success > pdiffs) { trashCode += 'f'; } + else{ currentSeqsDiffs += success; } + } + + if (currentSeqsDiffs > tdiffs) { trashCode += 't'; } + + if(numRPrimers != 0){ + success = stripReverse(currSeq, currQual); + if(!success) { trashCode += 'r'; } + } + + if(keepFirst != 0){ + success = keepFirstTrim(currSeq, currQual); + } + + if(removeLast != 0){ + success = removeLastTrim(currSeq, currQual); + if(!success) { trashCode += 'l'; } + } + + + if(qFileName != ""){ + + if(qThreshold != 0) { success = currQual.stripQualThreshold(currSeq, qThreshold); } + else if(qAverage != 0) { success = currQual.cullQualAverage(currSeq, qAverage); } + else if(qRollAverage != 0) { success = currQual.stripQualRollingAverage(currSeq, qRollAverage); } + else if(qWindowAverage != 0){ success = currQual.stripQualWindowAverage(currSeq, qWindowStep, qWindowSize, qWindowAverage); } + else { success = 1; } + +// if (qtrim == 1 && (origSeq.length() != currSeq.getUnaligned().length())) { +// success = 0; //if you don't want to trim and the sequence does not meet quality requirements, move to scrap +// } + + if(!success) { trashCode += 'q'; } + } + + if(minLength > 0 || maxLength > 0){ + success = cullLength(currSeq); + if(!success) { trashCode += 'l'; } + } + if(maxHomoP > 0){ + success = cullHomoP(currSeq); + if(!success) { trashCode += 'h'; } + } + if(maxAmbig != -1){ + success = cullAmbigs(currSeq); + if(!success) { trashCode += 'n'; } + } + + if(flip){ // should go last + currSeq.reverseComplement(); + currQual.flipQScores(); + } + + if(trashCode.length() == 0){ + currSeq.setAligned(currSeq.getUnaligned()); + currSeq.printSequence(outFASTA); + currQual.printQScores(outQual); + + if(barcodes.size() != 0){ + string thisGroup = groupVector[groupBar]; + int indexToFastaFile = groupBar; + if (primers.size() != 0){ + //does this primer have a group + if (groupVector[groupPrime] != "") { + thisGroup += "." + groupVector[groupPrime]; + indexToFastaFile = combos[thisGroup]; + } + } + outGroups << currSeq.getName() << '\t' << thisGroup << endl; + if(allFiles){ + ofstream outTemp; + m->openOutputFileAppend(fastaNames[indexToFastaFile], outTemp); + //currSeq.printSequence(*fastaFileNames[indexToFastaFile]); + currSeq.printSequence(outTemp); + outTemp.close(); + + if(qFileName != ""){ + //currQual.printQScores(*qualFileNames[indexToFastaFile]); + ofstream outTemp2; + m->openOutputFileAppend(qualNames[indexToFastaFile], outTemp2); + currQual.printQScores(outTemp2); + outTemp2.close(); + } + } + } - if(allFiles){ - currSeq.printSequence(*fastaFileNames[group]); + if (groupfile != "") { + string thisGroup = groupMap->getGroup(currSeq.getName()); + + if (thisGroup != "not found") { + outGroups << currSeq.getName() << '\t' << thisGroup << endl; + if (allFiles) { + ofstream outTemp; + m->openOutputFileAppend(fastaNames[groupToIndex[thisGroup]], outTemp); + currSeq.printSequence(outTemp); + outTemp.close(); + if(qFileName != ""){ + ofstream outTemp2; + m->openOutputFileAppend(qualNames[groupToIndex[thisGroup]], outTemp2); + currQual.printQScores(outTemp2); + outTemp2.close(); + } + } + }else{ + m->mothurOut(currSeq.getName() + " is not in your groupfile, adding to group XXX."); m->mothurOutEndLine(); + outGroups << currSeq.getName() << '\t' << "XXX" << endl; + if (allFiles) { + m->mothurOut("[ERROR]: " + currSeq.getName() + " will not be added to any .group.fasta or .group.qual file."); m->mothurOutEndLine(); + } + } } } + else{ + currSeq.setName(currSeq.getName() + '|' + trashCode); + currSeq.setUnaligned(origSeq); + currSeq.setAligned(origSeq); + currSeq.printSequence(scrapFASTA); + currQual.printQScores(scrapQual); + } + count++; } - else{ - currSeq.setName(currSeq.getName() + '|' + trashCode); - currSeq.setUnaligned(origSeq); - currSeq.printSequence(scrapFASTA); - } - gobble(inFASTA); + + #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) + unsigned long int pos = inFASTA.tellg(); + if ((pos == -1) || (pos >= line->end)) { break; } + #else + if (inFASTA.eof()) { break; } + #endif + + //report progress + if((count) % 1000 == 0){ m->mothurOut(toString(count)); m->mothurOutEndLine(); } + } + //report progress + if((count) % 1000 != 0){ m->mothurOut(toString(count)); m->mothurOutEndLine(); } + + inFASTA.close(); outFASTA.close(); scrapFASTA.close(); - outGroups.close(); + if (oligoFile != "") { outGroups.close(); } + if(qFileName != "") { qFile.close(); scrapQual.close(); outQual.close(); } - for(int i=0;iclose(); - delete fastaFileNames[i]; - } + return count; + } + catch(exception& e) { + m->errorOut(e, "TrimSeqsCommand", "driverCreateTrim"); + exit(1); + } +} + +/**************************************************************************************************/ + +int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName, string trimFile, string scrapFile, string trimQFile, string scrapQFile, string groupFile, vector fastaNames, vector qualNames) { + try { +#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) + int process = 1; + int exitCommand = 1; + processIDS.clear(); - for(int i=0;i'){ - inFASTA >> seqName; - outGroups << seqName << '\t' << groupVector[i] << endl; + if (pid > 0) { + processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later + process++; + }else if (pid == 0){ + for (int i = 0; i < fastaNames.size(); i++) { + fastaNames[i] = (fastaNames[i] + toString(getpid()) + ".temp"); + //clear old file if it exists + ofstream temp; + m->openOutputFile(fastaNames[i], temp); + temp.close(); + if(qFileName != ""){ + qualNames[i] = (qualNames[i] + toString(getpid()) + ".temp"); + //clear old file if it exists + ofstream temp2; + m->openOutputFile(qualNames[i], temp2); + temp2.close(); + } } - while (!inFASTA.eof()) { char c = inFASTA.get(); if (c == 10 || c == 13){ break; } } + + driverCreateTrim(filename, qFileName, (trimFile + toString(getpid()) + ".temp"), (scrapFile + toString(getpid()) + ".temp"), (trimQFile + toString(getpid()) + ".temp"), (scrapQFile + toString(getpid()) + ".temp"), (groupFile + toString(getpid()) + ".temp"), fastaNames, qualNames, lines[process], qLines[process]); + exit(0); + }else { + m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine(); + for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); } + exit(0); + } + } + + //parent do my part + for (int i = 0; i < fastaNames.size(); i++) { + //clear old file if it exists + ofstream temp; + m->openOutputFile(fastaNames[i], temp); + temp.close(); + if(qFileName != ""){ + //clear old file if it exists + ofstream temp2; + m->openOutputFile(qualNames[i], temp2); + temp2.close(); } - outGroups.close(); - inFASTA.close(); } + driverCreateTrim(filename, qFileName, trimFile, scrapFile, trimQFile, scrapQFile, groupFile, fastaNames, qualNames, lines[0], qLines[0]); - return 0; + + //force parent to wait until all the processes are done + for (int i=0;imothurOut("Appending files from process " + toString(processIDS[i])); m->mothurOutEndLine(); + + m->appendFiles((trimFile + toString(processIDS[i]) + ".temp"), trimFile); + remove((trimFile + toString(processIDS[i]) + ".temp").c_str()); + m->appendFiles((scrapFile + toString(processIDS[i]) + ".temp"), scrapFile); + remove((scrapFile + toString(processIDS[i]) + ".temp").c_str()); + + m->mothurOut("Done with fasta files"); m->mothurOutEndLine(); + + if(qFileName != ""){ + m->appendFiles((trimQFile + toString(processIDS[i]) + ".temp"), trimQFile); + remove((trimQFile + toString(processIDS[i]) + ".temp").c_str()); + m->appendFiles((scrapQFile + toString(processIDS[i]) + ".temp"), scrapQFile); + remove((scrapQFile + toString(processIDS[i]) + ".temp").c_str()); + + m->mothurOut("Done with quality files"); m->mothurOutEndLine(); + } + + m->appendFiles((groupFile + toString(processIDS[i]) + ".temp"), groupFile); + remove((groupFile + toString(processIDS[i]) + ".temp").c_str()); + + m->mothurOut("Done with group file"); m->mothurOutEndLine(); + + for (int j = 0; j < fastaNames.size(); j++) { + m->appendFiles((fastaNames[j] + toString(processIDS[i]) + ".temp"), fastaNames[j]); + remove((fastaNames[j] + toString(processIDS[i]) + ".temp").c_str()); + } + + if(qFileName != ""){ + for (int j = 0; j < qualNames.size(); j++) { + m->appendFiles((qualNames[j] + toString(processIDS[i]) + ".temp"), qualNames[j]); + remove((qualNames[j] + toString(processIDS[i]) + ".temp").c_str()); + } + } + + if (allFiles) { m->mothurOut("Done with allfiles"); m->mothurOutEndLine(); } + } + + return exitCommand; +#endif } catch(exception& e) { - cout << "Standard Error: " << e.what() << " has occurred in the TrimSeqsCommand class Function execute. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n"; + m->errorOut(e, "TrimSeqsCommand", "createProcessesCreateTrim"); exit(1); } - catch(...) { - cout << "An unknown error has occurred in the TrimSeqsCommand class function execute. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n"; +} + +/**************************************************************************************************/ + +int TrimSeqsCommand::setLines(string filename, string qfilename, vector& fastaFilePos, vector& qfileFilePos) { + try { + + //set file positions for fasta file + fastaFilePos = m->divideFile(filename, processors); + + if (qfilename == "") { return processors; } + + //get name of first sequence in each chunk + map firstSeqNames; + for (int i = 0; i < (fastaFilePos.size()-1); i++) { + ifstream in; + m->openInputFile(filename, in); + in.seekg(fastaFilePos[i]); + + Sequence temp(in); + firstSeqNames[temp.getName()] = i; + + in.close(); + } + + //seach for filePos of each first name in the qfile and save in qfileFilePos + ifstream inQual; + m->openInputFile(qfilename, inQual); + + string input; + while(!inQual.eof()){ + input = m->getline(inQual); + + if (input.length() != 0) { + if(input[0] == '>'){ //this is a sequence name line + istringstream nameStream(input); + + string sname = ""; nameStream >> sname; + sname = sname.substr(1); + + map::iterator it = firstSeqNames.find(sname); + + if(it != firstSeqNames.end()) { //this is the start of a new chunk + unsigned long int pos = inQual.tellg(); + qfileFilePos.push_back(pos - input.length() - 1); + firstSeqNames.erase(it); + } + } + } + + if (firstSeqNames.size() == 0) { break; } + } + inQual.close(); + + + if (firstSeqNames.size() != 0) { + for (map::iterator it = firstSeqNames.begin(); it != firstSeqNames.end(); it++) { + m->mothurOut(it->first + " is in your fasta file and not in your quality file, not using quality file."); m->mothurOutEndLine(); + } + qFileName = ""; + return processors; + } + + //get last file position of qfile + FILE * pFile; + unsigned long int size; + + //get num bytes in file + pFile = fopen (qfilename.c_str(),"rb"); + if (pFile==NULL) perror ("Error opening file"); + else{ + fseek (pFile, 0, SEEK_END); + size=ftell (pFile); + fclose (pFile); + } + + qfileFilePos.push_back(size); + + return processors; + } + catch(exception& e) { + m->errorOut(e, "TrimSeqsCommand", "setLines"); exit(1); } } //*************************************************************************************************************** -void TrimSeqsCommand::getOligos(vector& outFASTAVec){ - - ifstream inOligos; - openInputFile(oligoFile, inOligos); +void TrimSeqsCommand::getOligos(vector& outFASTAVec, vector& outQualVec){ + try { + ifstream inOligos; + m->openInputFile(oligoFile, inOligos); + + ofstream test; + + string type, oligo, group; + int index=0; + //int indexPrimer = 0; + + while(!inOligos.eof()){ + inOligos >> type; m->gobble(inOligos); + + if(type[0] == '#'){ + while (!inOligos.eof()) { char c = inOligos.get(); if (c == 10 || c == 13){ break; } } // get rest of line if there's any crap there + } + else{ + //make type case insensitive + for(int i=0;i> oligo; + + for(int i=0;i::iterator itPrime = primers.find(oligo); + if (itPrime != primers.end()) { m->mothurOut("primer " + oligo + " is in your oligos file already."); m->mothurOutEndLine(); } + + primers[oligo]=index; index++; + groupVector.push_back(group); + + if(allFiles){ + outFASTAVec.push_back((outputDir + m->getRootName(m->getSimpleName(fastaFile)) + group + ".fasta")); + if(qFileName != ""){ + outQualVec.push_back((outputDir + m->getRootName(m->getSimpleName(qFileName)) + group + ".qual")); + } + if (group == "") { //if there is not a group for this primer, then this file will not get written to, but we add it to keep the indexes correct + filesToRemove.insert((outputDir + m->getRootName(m->getSimpleName(fastaFile)) + group + ".fasta")); + if(qFileName != ""){ + filesToRemove.insert((outputDir + m->getRootName(m->getSimpleName(qFileName)) + group + ".qual")); + } + }else { + outputNames.push_back((outputDir + m->getRootName(m->getSimpleName(fastaFile)) + group + ".fasta")); + outputTypes["fasta"].push_back((outputDir + m->getRootName(m->getSimpleName(fastaFile)) + group + ".fasta")); + if(qFileName != ""){ + outputNames.push_back((outputDir + m->getRootName(m->getSimpleName(qFileName)) + group + ".qual")); + outputTypes["qual"].push_back((outputDir + m->getRootName(m->getSimpleName(qFileName)) + group + ".qual")); + } + } + } + + } + else if(type == "REVERSE"){ + Sequence oligoRC("reverse", oligo); + oligoRC.reverseComplement(); + revPrimer.push_back(oligoRC.getUnaligned()); + } + else if(type == "BARCODE"){ + inOligos >> group; + + //check for repeat barcodes + map::iterator itBar = barcodes.find(oligo); + if (itBar != barcodes.end()) { m->mothurOut("barcode " + oligo + " is in your oligos file already."); m->mothurOutEndLine(); } + + barcodes[oligo]=index; index++; + groupVector.push_back(group); + + if(allFiles){ + outputNames.push_back((outputDir + m->getRootName(m->getSimpleName(fastaFile)) + group + ".fasta")); + outputNames.push_back((outputDir + m->getRootName(m->getSimpleName(fastaFile)) + group + ".fasta")); + outFASTAVec.push_back((outputDir + m->getRootName(m->getSimpleName(fastaFile)) + group + ".fasta")); + if(qFileName != ""){ + outQualVec.push_back((outputDir + m->getRootName(m->getSimpleName(qFileName)) + group + ".qual")); + outputNames.push_back((outputDir + m->getRootName(m->getSimpleName(qFileName)) + group + ".qual")); + outputTypes["qual"].push_back((outputDir + m->getRootName(m->getSimpleName(qFileName)) + group + ".qual")); + } + } + + }else{ m->mothurOut(type + " is not recognized as a valid type. Choices are forward, reverse, and barcode. Ignoring " + oligo + "."); m->mothurOutEndLine(); } + } + m->gobble(inOligos); + } + + inOligos.close(); + + //add in potential combos + if(allFiles){ + comboStarts = outFASTAVec.size()-1; + for (map::iterator itBar = barcodes.begin(); itBar != barcodes.end(); itBar++) { + for (map::iterator itPrime = primers.begin(); itPrime != primers.end(); itPrime++) { + if (groupVector[itPrime->second] != "") { //there is a group for this primer + outputNames.push_back((outputDir + m->getRootName(m->getSimpleName(qFileName)) + groupVector[itBar->second] + "." + groupVector[itPrime->second] + ".fasta")); + outputTypes["fasta"].push_back((outputDir + m->getRootName(m->getSimpleName(qFileName)) + groupVector[itBar->second] + "." + groupVector[itPrime->second] + ".fasta")); + outFASTAVec.push_back((outputDir + m->getRootName(m->getSimpleName(fastaFile)) + groupVector[itBar->second] + "." + groupVector[itPrime->second] + ".fasta")); + combos[(groupVector[itBar->second] + "." + groupVector[itPrime->second])] = outFASTAVec.size()-1; + + if(qFileName != ""){ + outQualVec.push_back((outputDir + m->getRootName(m->getSimpleName(qFileName)) + groupVector[itBar->second] + "." + groupVector[itPrime->second] + ".qual")); + outputNames.push_back((outputDir + m->getRootName(m->getSimpleName(qFileName)) + groupVector[itBar->second] + "." + groupVector[itPrime->second] + ".qual")); + outputTypes["qual"].push_back((outputDir + m->getRootName(m->getSimpleName(qFileName)) + groupVector[itBar->second] + "." + groupVector[itPrime->second] + ".qual")); + } + } + } + } + } + + numFPrimers = primers.size(); + numRPrimers = revPrimer.size(); + + } + catch(exception& e) { + m->errorOut(e, "TrimSeqsCommand", "getOligos"); + exit(1); + } +} +//*************************************************************************************************************** - ofstream test; - - string type, oligo, group; - int index=0; +int TrimSeqsCommand::stripBarcode(Sequence& seq, QualityScores& qual, int& group){ + try { + + string rawSequence = seq.getUnaligned(); + int success = bdiffs + 1; //guilty until proven innocent + + //can you find the barcode + for(map::iterator it=barcodes.begin();it!=barcodes.end();it++){ + string oligo = it->first; + if(rawSequence.length() < oligo.length()){ //let's just assume that the barcodes are the same length + success = bdiffs + 10; //if the sequence is shorter than the barcode then bail out + break; + } + + if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){ + group = it->second; + seq.setUnaligned(rawSequence.substr(oligo.length())); + + if(qual.getName() != ""){ + qual.trimQScores(oligo.length(), -1); + } + + success = 0; + break; + } + } + + //if you found the barcode or if you don't want to allow for diffs +// cout << success; + if ((bdiffs == 0) || (success == 0)) { return success; } + + else { //try aligning and see if you can find it +// cout << endl; - while(!inOligos.eof()){ - inOligos >> type; + int maxLength = 0; - if(type[0] == '#'){ - while (!inOligos.eof()) { char c = inOligos.get(); if (c == 10 || c == 13){ break; } } // get rest of line if there's any crap there - } - else{ - inOligos >> oligo; + Alignment* alignment; + if (barcodes.size() > 0) { + map::iterator it=barcodes.begin(); + + for(it;it!=barcodes.end();it++){ + if(it->first.length() > maxLength){ + maxLength = it->first.length(); + } + } + alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+bdiffs+1)); + + }else{ alignment = NULL; } - for(int i=0;i::iterator it=barcodes.begin();it!=barcodes.end();it++){ + string oligo = it->first; +// int length = oligo.length(); + + if(rawSequence.length() < maxLength){ //let's just assume that the barcodes are the same length + success = bdiffs + 10; + break; + } + + //use needleman to align first barcode.length()+numdiffs of sequence to each barcode + alignment->align(oligo, rawSequence.substr(0,oligo.length()+bdiffs)); + oligo = alignment->getSeqAAln(); + string temp = alignment->getSeqBAln(); + + int alnLength = oligo.length(); + + for(int i=oligo.length()-1;i>=0;i--){ + if(oligo[i] != '-'){ alnLength = i+1; break; } + } + oligo = oligo.substr(0,alnLength); + temp = temp.substr(0,alnLength); + + int newStart=0; + int numDiff = countDiffs(oligo, temp); + +// cout << oligo << '\t' << temp << '\t' << numDiff << endl; + + if(numDiff < minDiff){ + minDiff = numDiff; + minCount = 1; + minGroup = it->second; + minPos = 0; + for(int i=0;i> group; - barcodes[oligo]=index++; - groupVector.push_back(group); - - if(allFiles){ - outFASTAVec.push_back(new ofstream((getRootName(fastaFile) + group + ".fasta").c_str(), ios::ate)); + + if(minDiff > bdiffs) { success = minDiff; } //no good matches + else if(minCount > 1) { success = bdiffs + 100; } //can't tell the difference between multiple barcodes + else{ //use the best match + group = minGroup; + seq.setUnaligned(rawSequence.substr(minPos)); + + if(qual.getName() != ""){ + qual.trimQScores(minPos, -1); } + success = minDiff; } + + if (alignment != NULL) { delete alignment; } + } +// cout << success << endl; + + return success; + } - - inOligos.close(); - - numFPrimers = forPrimer.size(); - numRPrimers = revPrimer.size(); + catch(exception& e) { + m->errorOut(e, "TrimSeqsCommand", "stripBarcode"); + exit(1); + } + } //*************************************************************************************************************** -bool TrimSeqsCommand::stripBarcode(Sequence& seq, int& group){ - - string rawSequence = seq.getUnaligned(); - bool success = 0; //guilty until proven innocent - - for(map::iterator it=barcodes.begin();it!=barcodes.end();it++){ - string oligo = it->first; - if(rawSequence.length() < oligo.length()){ //let's just assume that the barcodes are the same length - success = 0; - break; +int TrimSeqsCommand::stripForward(Sequence& seq, QualityScores& qual, int& group){ + try { + string rawSequence = seq.getUnaligned(); + int success = pdiffs + 1; //guilty until proven innocent + + //can you find the primer + for(map::iterator it=primers.begin();it!=primers.end();it++){ + string oligo = it->first; + if(rawSequence.length() < oligo.length()){ //let's just assume that the primers are the same length + success = pdiffs + 10; //if the sequence is shorter than the barcode then bail out + break; + } + + if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){ + group = it->second; + seq.setUnaligned(rawSequence.substr(oligo.length())); + if(qual.getName() != ""){ + qual.trimQScores(oligo.length(), -1); + } + success = 0; + break; + } } + + //if you found the barcode or if you don't want to allow for diffs +// cout << success; + if ((pdiffs == 0) || (success == 0)) { return success; } - if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){ - group = it->second; - seq.setUnaligned(rawSequence.substr(oligo.length())); - success = 1; - break; + else { //try aligning and see if you can find it +// cout << endl; + + int maxLength = 0; + + Alignment* alignment; + if (primers.size() > 0) { + map::iterator it=primers.begin(); + + for(it;it!=primers.end();it++){ + if(it->first.length() > maxLength){ + maxLength = it->first.length(); + } + } + alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+pdiffs+1)); + + }else{ alignment = NULL; } + + //can you find the barcode + int minDiff = 1e6; + int minCount = 1; + int minGroup = -1; + int minPos = 0; + + for(map::iterator it=primers.begin();it!=primers.end();it++){ + string oligo = it->first; +// int length = oligo.length(); + + if(rawSequence.length() < maxLength){ + success = pdiffs + 100; + break; + } + + //use needleman to align first barcode.length()+numdiffs of sequence to each barcode + alignment->align(oligo, rawSequence.substr(0,oligo.length()+pdiffs)); + oligo = alignment->getSeqAAln(); + string temp = alignment->getSeqBAln(); + + int alnLength = oligo.length(); + + for(int i=oligo.length()-1;i>=0;i--){ + if(oligo[i] != '-'){ alnLength = i+1; break; } + } + oligo = oligo.substr(0,alnLength); + temp = temp.substr(0,alnLength); + + int newStart=0; + int numDiff = countDiffs(oligo, temp); + +// cout << oligo << '\t' << temp << '\t' << numDiff << endl; + + if(numDiff < minDiff){ + minDiff = numDiff; + minCount = 1; + minGroup = it->second; + minPos = 0; + for(int i=0;i pdiffs) { success = minDiff; } //no good matches + else if(minCount > 1) { success = pdiffs + 10; } //can't tell the difference between multiple primers + else{ //use the best match + group = minGroup; + seq.setUnaligned(rawSequence.substr(minPos)); + if(qual.getName() != ""){ + qual.trimQScores(minPos, -1); + } + success = minDiff; + } + + if (alignment != NULL) { delete alignment; } + } + + return success; + + } + catch(exception& e) { + m->errorOut(e, "TrimSeqsCommand", "stripForward"); + exit(1); } - return success; - } //*************************************************************************************************************** -bool TrimSeqsCommand::stripForward(Sequence& seq){ - - string rawSequence = seq.getUnaligned(); - bool success = 0; //guilty until proven innocent - - for(int i=0;ierrorOut(e, "TrimSeqsCommand", "stripReverse"); + exit(1); + } +} - if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){ - seq.setUnaligned(rawSequence.substr(oligo.length())); - success = 1; - break; +//*************************************************************************************************************** + +bool TrimSeqsCommand::keepFirstTrim(Sequence& sequence, QualityScores& qscores){ + try { + bool success = 1; + if(qscores.getName() != ""){ + qscores.trimQScores(-1, keepFirst); } + sequence.trim(keepFirst); + return success; + } + catch(exception& e) { + m->errorOut(e, "keepFirstTrim", "countDiffs"); + exit(1); } - return success; - -} +} //*************************************************************************************************************** -bool TrimSeqsCommand::stripReverse(Sequence& seq){ - - string rawSequence = seq.getUnaligned(); - bool success = 0; //guilty until proven innocent - - for(int i=0;i 0){ + if(qscores.getName() != ""){ + qscores.trimQScores(-1, length); + } + sequence.trim(length); success = 1; - break; } - } - return success; + else{ + success = 0; + } + + return success; + } + catch(exception& e) { + m->errorOut(e, "removeLastTrim", "countDiffs"); + exit(1); + } -} +} //*************************************************************************************************************** bool TrimSeqsCommand::cullLength(Sequence& seq){ + try { - int length = seq.getNumBases(); - bool success = 0; //guilty until proven innocent - - if(length >= minLength && maxLength == 0) { success = 1; } - else if(length >= minLength && length <= maxLength) { success = 1; } - else { success = 0; } + int length = seq.getNumBases(); + bool success = 0; //guilty until proven innocent + + if(length >= minLength && maxLength == 0) { success = 1; } + else if(length >= minLength && length <= maxLength) { success = 1; } + else { success = 0; } + + return success; - return success; + } + catch(exception& e) { + m->errorOut(e, "TrimSeqsCommand", "cullLength"); + exit(1); + } } //*************************************************************************************************************** bool TrimSeqsCommand::cullHomoP(Sequence& seq){ - - int longHomoP = seq.getLongHomoPolymer(); - bool success = 0; //guilty until proven innocent - - if(longHomoP <= maxHomoP){ success = 1; } - else { success = 0; } - - return success; + try { + int longHomoP = seq.getLongHomoPolymer(); + bool success = 0; //guilty until proven innocent + + if(longHomoP <= maxHomoP){ success = 1; } + else { success = 0; } + + return success; + } + catch(exception& e) { + m->errorOut(e, "TrimSeqsCommand", "cullHomoP"); + exit(1); + } } //*************************************************************************************************************** bool TrimSeqsCommand::cullAmbigs(Sequence& seq){ - - int numNs = seq.getAmbigBases(); - bool success = 0; //guilty until proven innocent - - if(numNs <= maxAmbig) { success = 1; } - else { success = 0; } - - return success; + try { + int numNs = seq.getAmbigBases(); + bool success = 0; //guilty until proven innocent + + if(numNs <= maxAmbig) { success = 1; } + else { success = 0; } + + return success; + } + catch(exception& e) { + m->errorOut(e, "TrimSeqsCommand", "cullAmbigs"); + exit(1); + } } //*************************************************************************************************************** bool TrimSeqsCommand::compareDNASeq(string oligo, string seq){ - - bool success = 1; - int length = oligo.length(); - - for(int i=0;ierrorOut(e, "TrimSeqsCommand", "compareDNASeq"); + exit(1); + } + } //*************************************************************************************************************** -bool TrimSeqsCommand::stripQualThreshold(Sequence& seq, ifstream& qFile){ - - string rawSequence = seq.getUnaligned(); - int seqLength = rawSequence.length(); - string name; - - qFile >> name; - if(name.substr(1) != seq.getName()) { cout << "sequence name mismatch btwn fasta and qual file" << endl; } - while (!qFile.eof()) { char c = qFile.get(); if (c == 10 || c == 13){ break; } } - - int score; - int end = seqLength; - - for(int i=0;i> score; +int TrimSeqsCommand::countDiffs(string oligo, string seq){ + try { - if(score <= qThreshold){ - end = i; - break; + int length = oligo.length(); + int countDiffs = 0; + + for(int i=0;i> score; + catch(exception& e) { + m->errorOut(e, "TrimSeqsCommand", "countDiffs"); + exit(1); } - seq.setUnaligned(rawSequence.substr(0,end)); - - return 1; -} - -//*************************************************************************************************************** - -bool TrimSeqsCommand::cullQualAverage(Sequence& seq, ifstream& qFile){ - - string rawSequence = seq.getUnaligned(); - int seqLength = seq.getNumBases(); - bool success = 0; //guilty until proven innocent - string name; - - qFile >> name; - if(name.substr(1) != seq.getName()) { cout << "sequence name mismatch btwn fasta and qual file" << endl; } - while (!qFile.eof()) { char c = qFile.get(); if (c == 10 || c == 13){ break; } } - - float score; - float average = 0; - - for(int i=0;i> score; - average += score; - } - average /= seqLength; - - if(average >= qAverage) { success = 1; } - else { success = 0; } - - return success; } //*************************************************************************************************************** - -