X-Git-Url: https://git.donarmstrong.com/?p=mothur.git;a=blobdiff_plain;f=trimseqscommand.cpp;h=16d83de2596b9cf84ab4069789917878fa89559e;hp=d1b9a5c43549991bd3876164a318b8d9accb54ce;hb=a8e2df1b96a57f5f29576b08361b86a96a8eff4f;hpb=e840ba0e90a2b2be39e0910e4be23dde6a14cbac diff --git a/trimseqscommand.cpp b/trimseqscommand.cpp index d1b9a5c..16d83de 100644 --- a/trimseqscommand.cpp +++ b/trimseqscommand.cpp @@ -9,67 +9,136 @@ #include "trimseqscommand.h" #include "needlemanoverlap.hpp" +#include "trimoligos.h" -//********************************************************************************************************************** -vector TrimSeqsCommand::getValidParameters(){ +//********************************************************************************************************************** +vector TrimSeqsCommand::setParameters(){ try { - string Array[] = {"fasta", "flip", "oligos", "maxambig", "maxhomop","minlength", "maxlength", "qfile", - "qthreshold", "qwindowaverage", "qstepsize", "qwindowsize", "qaverage", "rollaverage", - "keepfirst", "removelast", - "allfiles", "qtrim","tdiffs", "pdiffs", "bdiffs", "processors", "outputdir","inputdir"}; - vector myArray (Array, Array+(sizeof(Array)/sizeof(string))); + CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none","fasta",false,true,true); parameters.push_back(pfasta); + CommandParameter poligos("oligos", "InputTypes", "", "", "none", "none", "none","group",false,false,true); parameters.push_back(poligos); + CommandParameter pqfile("qfile", "InputTypes", "", "", "none", "none", "none","qfile",false,false,true); parameters.push_back(pqfile); + CommandParameter pname("name", "InputTypes", "", "", "namecount", "none", "none","name",false,false,true); parameters.push_back(pname); + CommandParameter pcount("count", "InputTypes", "", "", "namecount", "none", "none","count",false,false,true); parameters.push_back(pcount); + CommandParameter pflip("flip", "Boolean", "", "F", "", "", "","",false,false,true); parameters.push_back(pflip); + CommandParameter preorient("checkorient", "Boolean", "", "F", "", "", "","",false,false,true); parameters.push_back(preorient); + CommandParameter pmaxambig("maxambig", "Number", "", "-1", "", "", "","",false,false); parameters.push_back(pmaxambig); + CommandParameter pmaxhomop("maxhomop", "Number", "", "0", "", "", "","",false,false); parameters.push_back(pmaxhomop); + CommandParameter pminlength("minlength", "Number", "", "0", "", "", "","",false,false); parameters.push_back(pminlength); + CommandParameter pmaxlength("maxlength", "Number", "", "0", "", "", "","",false,false); parameters.push_back(pmaxlength); + CommandParameter ppdiffs("pdiffs", "Number", "", "0", "", "", "","",false,false,true); parameters.push_back(ppdiffs); + CommandParameter pbdiffs("bdiffs", "Number", "", "0", "", "", "","",false,false,true); parameters.push_back(pbdiffs); + CommandParameter pldiffs("ldiffs", "Number", "", "0", "", "", "","",false,false); parameters.push_back(pldiffs); + CommandParameter psdiffs("sdiffs", "Number", "", "0", "", "", "","",false,false); parameters.push_back(psdiffs); + CommandParameter ptdiffs("tdiffs", "Number", "", "0", "", "", "","",false,false); parameters.push_back(ptdiffs); + CommandParameter pprocessors("processors", "Number", "", "1", "", "", "","",false,false,true); parameters.push_back(pprocessors); + CommandParameter pallfiles("allfiles", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(pallfiles); + CommandParameter pkeepforward("keepforward", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(pkeepforward); + CommandParameter pqtrim("qtrim", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(pqtrim); + CommandParameter pqthreshold("qthreshold", "Number", "", "0", "", "", "","",false,false); parameters.push_back(pqthreshold); + CommandParameter pqaverage("qaverage", "Number", "", "0", "", "", "","",false,false); parameters.push_back(pqaverage); + CommandParameter prollaverage("rollaverage", "Number", "", "0", "", "", "","",false,false); parameters.push_back(prollaverage); + CommandParameter pqwindowaverage("qwindowaverage", "Number", "", "0", "", "", "","",false,false); parameters.push_back(pqwindowaverage); + CommandParameter pqstepsize("qstepsize", "Number", "", "1", "", "", "","",false,false); parameters.push_back(pqstepsize); + CommandParameter pqwindowsize("qwindowsize", "Number", "", "50", "", "", "","",false,false); parameters.push_back(pqwindowsize); + CommandParameter pkeepfirst("keepfirst", "Number", "", "0", "", "", "","",false,false); parameters.push_back(pkeepfirst); + CommandParameter premovelast("removelast", "Number", "", "0", "", "", "","",false,false); parameters.push_back(premovelast); + CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir); + CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir); + + vector myArray; + for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); } return myArray; } catch(exception& e) { - m->errorOut(e, "TrimSeqsCommand", "getValidParameters"); + m->errorOut(e, "TrimSeqsCommand", "setParameters"); exit(1); } } - //********************************************************************************************************************** - -TrimSeqsCommand::TrimSeqsCommand(){ +string TrimSeqsCommand::getHelpString(){ try { - abort = true; calledHelp = true; - vector tempOutNames; - outputTypes["fasta"] = tempOutNames; - outputTypes["qfile"] = tempOutNames; - outputTypes["group"] = tempOutNames; + string helpString = ""; + helpString += "The trim.seqs command reads a fastaFile and creates 2 new fasta files, .trim.fasta and scrap.fasta, as well as group files if you provide and oligos file.\n"; + helpString += "The .trim.fasta contains sequences that meet your requirements, and the .scrap.fasta contains those which don't.\n"; + helpString += "The trim.seqs command parameters are fasta, name, count, flip, checkorient, oligos, maxambig, maxhomop, minlength, maxlength, qfile, qthreshold, qaverage, diffs, qtrim, keepfirst, removelast and allfiles.\n"; + helpString += "The fasta parameter is required.\n"; + helpString += "The flip parameter will output the reverse compliment of your trimmed sequence. The default is false.\n"; + helpString += "The checkorient parameter will check the reverse compliment of the sequence if the barcodes and primers cannot be found in the forward. The default is false.\n"; + helpString += "The oligos parameter allows you to provide an oligos file.\n"; + helpString += "The name parameter allows you to provide a names file with your fasta file.\n"; + helpString += "The count parameter allows you to provide a count file with your fasta file.\n"; + helpString += "The maxambig parameter allows you to set the maximum number of ambigious bases allowed. The default is -1.\n"; + helpString += "The maxhomop parameter allows you to set a maximum homopolymer length. \n"; + helpString += "The minlength parameter allows you to set and minimum sequence length. \n"; + helpString += "The maxlength parameter allows you to set and maximum sequence length. \n"; + helpString += "The tdiffs parameter is used to specify the total number of differences allowed in the sequence. The default is pdiffs + bdiffs + sdiffs + ldiffs.\n"; + helpString += "The bdiffs parameter is used to specify the number of differences allowed in the barcode. The default is 0.\n"; + helpString += "The pdiffs parameter is used to specify the number of differences allowed in the primer. The default is 0.\n"; + helpString += "The ldiffs parameter is used to specify the number of differences allowed in the linker. The default is 0.\n"; + helpString += "The sdiffs parameter is used to specify the number of differences allowed in the spacer. The default is 0.\n"; + helpString += "The qfile parameter allows you to provide a quality file.\n"; + helpString += "The qthreshold parameter allows you to set a minimum quality score allowed. \n"; + helpString += "The qaverage parameter allows you to set a minimum average quality score allowed. \n"; + helpString += "The qwindowsize parameter allows you to set a number of bases in a window. Default=50.\n"; + helpString += "The qwindowaverage parameter allows you to set a minimum average quality score allowed over a window. \n"; + helpString += "The rollaverage parameter allows you to set a minimum rolling average quality score allowed over a window. \n"; + helpString += "The qstepsize parameter allows you to set a number of bases to move the window over. Default=1.\n"; + helpString += "The allfiles parameter will create separate group and fasta file for each grouping. The default is F.\n"; + helpString += "The keepforward parameter allows you to indicate whether you want the forward primer removed or not. The default is F, meaning remove the forward primer.\n"; + helpString += "The qtrim parameter will trim sequence from the point that they fall below the qthreshold and put it in the .trim file if set to true. The default is T.\n"; + helpString += "The keepfirst parameter trims the sequence to the first keepfirst number of bases after the barcode or primers are removed, before the sequence is checked to see if it meets the other requirements. \n"; + helpString += "The removelast removes the last removelast number of bases after the barcode or primers are removed, before the sequence is checked to see if it meets the other requirements.\n"; + helpString += "The trim.seqs command should be in the following format: \n"; + helpString += "trim.seqs(fasta=yourFastaFile, flip=yourFlip, oligos=yourOligos, maxambig=yourMaxambig, \n"; + helpString += "maxhomop=yourMaxhomop, minlength=youMinlength, maxlength=yourMaxlength) \n"; + helpString += "Example trim.seqs(fasta=abrecovery.fasta, flip=..., oligos=..., maxambig=..., maxhomop=..., minlength=..., maxlength=...).\n"; + helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n"; + helpString += "For more details please check out the wiki http://www.mothur.org/wiki/Trim.seqs .\n"; + return helpString; } catch(exception& e) { - m->errorOut(e, "TrimSeqsCommand", "TrimSeqsCommand"); + m->errorOut(e, "TrimSeqsCommand", "getHelpString"); exit(1); } } - //********************************************************************************************************************** - -vector TrimSeqsCommand::getRequiredParameters(){ - try { - string Array[] = {"fasta"}; - vector myArray (Array, Array+(sizeof(Array)/sizeof(string))); - return myArray; - } - catch(exception& e) { - m->errorOut(e, "TrimSeqsCommand", "getRequiredParameters"); - exit(1); - } +string TrimSeqsCommand::getOutputPattern(string type) { + try { + string pattern = ""; + + if (type == "qfile") { pattern = "[filename],[tag],qual"; } + else if (type == "fasta") { pattern = "[filename],[tag],fasta"; } + else if (type == "group") { pattern = "[filename],groups"; } + else if (type == "name") { pattern = "[filename],[tag],names"; } + else if (type == "count") { pattern = "[filename],[tag],count_table-[filename],count_table"; } + else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true; } + + return pattern; + } + catch(exception& e) { + m->errorOut(e, "TrimSeqsCommand", "getOutputPattern"); + exit(1); + } } - //********************************************************************************************************************** -vector TrimSeqsCommand::getRequiredFiles(){ +TrimSeqsCommand::TrimSeqsCommand(){ try { - vector myArray; - return myArray; + abort = true; calledHelp = true; + setParameters(); + vector tempOutNames; + outputTypes["fasta"] = tempOutNames; + outputTypes["qfile"] = tempOutNames; + outputTypes["group"] = tempOutNames; + outputTypes["name"] = tempOutNames; + outputTypes["count"] = tempOutNames; } catch(exception& e) { - m->errorOut(e, "TrimSeqsCommand", "getRequiredFiles"); + m->errorOut(e, "TrimSeqsCommand", "TrimSeqsCommand"); exit(1); } } - //*************************************************************************************************************** TrimSeqsCommand::TrimSeqsCommand(string option) { @@ -80,15 +149,10 @@ TrimSeqsCommand::TrimSeqsCommand(string option) { //allow user to run help if(option == "help") { help(); abort = true; calledHelp = true; } + else if(option == "citation") { citation(); abort = true; calledHelp = true;} else { - //valid paramters for this command - string AlignArray[] = { "fasta", "flip", "oligos", "maxambig", "maxhomop", "minlength", "maxlength", "qfile", - "qthreshold", "qwindowaverage", "qstepsize", "qwindowsize", "qaverage", "rollaverage", - "keepfirst", "removelast", - "allfiles", "qtrim","tdiffs", "pdiffs", "bdiffs", "processors", "outputdir","inputdir"}; - - vector myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string))); + vector myArray = setParameters(); OptionParser parser(option); map parameters = parser.getParameters(); @@ -106,6 +170,8 @@ TrimSeqsCommand::TrimSeqsCommand(string option) { outputTypes["fasta"] = tempOutNames; outputTypes["qfile"] = tempOutNames; outputTypes["group"] = tempOutNames; + outputTypes["name"] = tempOutNames; + outputTypes["count"] = tempOutNames; //if the user changes the input directory command factory will send this info to us in the output parameter string inputDir = validParameter.validFile(parameters, "inputdir", false); @@ -136,13 +202,33 @@ TrimSeqsCommand::TrimSeqsCommand(string option) { if (path == "") { parameters["qfile"] = inputDir + it->second; } } + it = parameters.find("name"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["name"] = inputDir + it->second; } + } + + it = parameters.find("count"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["count"] = inputDir + it->second; } + } + } //check for required parameters fastaFile = validParameter.validFile(parameters, "fasta", true); - if (fastaFile == "not found") { m->mothurOut("fasta is a required parameter for the trim.seqs command."); m->mothurOutEndLine(); abort = true; } - else if (fastaFile == "not open") { abort = true; } + if (fastaFile == "not found") { + fastaFile = m->getFastaFile(); + if (fastaFile != "") { m->mothurOut("Using " + fastaFile + " as input file for the fasta parameter."); m->mothurOutEndLine(); } + else { m->mothurOut("You have no current fastafile and the fasta parameter is required."); m->mothurOutEndLine(); abort = true; } + }else if (fastaFile == "not open") { abort = true; } + else { m->setFastaFile(fastaFile); } //if the user changes the output directory command factory will send this info to us in the output parameter outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ @@ -155,45 +241,63 @@ TrimSeqsCommand::TrimSeqsCommand(string option) { // ...at some point should added some additional type checking... string temp; temp = validParameter.validFile(parameters, "flip", false); - if (temp == "not found"){ flip = 0; } - else if(m->isTrue(temp)) { flip = 1; } + if (temp == "not found") { flip = 0; } + else { flip = m->isTrue(temp); } temp = validParameter.validFile(parameters, "oligos", true); if (temp == "not found"){ oligoFile = ""; } else if(temp == "not open"){ abort = true; } - else { oligoFile = temp; } + else { oligoFile = temp; m->setOligosFile(oligoFile); } temp = validParameter.validFile(parameters, "maxambig", false); if (temp == "not found") { temp = "-1"; } - convert(temp, maxAmbig); + m->mothurConvert(temp, maxAmbig); temp = validParameter.validFile(parameters, "maxhomop", false); if (temp == "not found") { temp = "0"; } - convert(temp, maxHomoP); + m->mothurConvert(temp, maxHomoP); temp = validParameter.validFile(parameters, "minlength", false); if (temp == "not found") { temp = "0"; } - convert(temp, minLength); + m->mothurConvert(temp, minLength); temp = validParameter.validFile(parameters, "maxlength", false); if (temp == "not found") { temp = "0"; } - convert(temp, maxLength); + m->mothurConvert(temp, maxLength); temp = validParameter.validFile(parameters, "bdiffs", false); if (temp == "not found") { temp = "0"; } - convert(temp, bdiffs); + m->mothurConvert(temp, bdiffs); temp = validParameter.validFile(parameters, "pdiffs", false); if (temp == "not found") { temp = "0"; } - convert(temp, pdiffs); + m->mothurConvert(temp, pdiffs); + + temp = validParameter.validFile(parameters, "ldiffs", false); if (temp == "not found") { temp = "0"; } + m->mothurConvert(temp, ldiffs); + + temp = validParameter.validFile(parameters, "sdiffs", false); if (temp == "not found") { temp = "0"; } + m->mothurConvert(temp, sdiffs); - temp = validParameter.validFile(parameters, "tdiffs", false); if (temp == "not found") { int tempTotal = pdiffs + bdiffs; temp = toString(tempTotal); } - convert(temp, tdiffs); + temp = validParameter.validFile(parameters, "tdiffs", false); if (temp == "not found") { int tempTotal = pdiffs + bdiffs + ldiffs + sdiffs; temp = toString(tempTotal); } + m->mothurConvert(temp, tdiffs); - if(tdiffs == 0){ tdiffs = bdiffs + pdiffs; } + if(tdiffs == 0){ tdiffs = bdiffs + pdiffs + ldiffs + sdiffs; } temp = validParameter.validFile(parameters, "qfile", true); if (temp == "not found") { qFileName = ""; } else if(temp == "not open") { abort = true; } - else { qFileName = temp; } + else { qFileName = temp; m->setQualFile(qFileName); } + + temp = validParameter.validFile(parameters, "name", true); + if (temp == "not found") { nameFile = ""; } + else if(temp == "not open") { nameFile = ""; abort = true; } + else { nameFile = temp; m->setNameFile(nameFile); } + + countfile = validParameter.validFile(parameters, "count", true); + if (countfile == "not open") { abort = true; countfile = ""; } + else if (countfile == "not found") { countfile = ""; } + else { m->setCountTableFile(countfile); } + + if ((countfile != "") && (nameFile != "")) { m->mothurOut("You must enter ONLY ONE of the following: count or name."); m->mothurOutEndLine(); abort = true; } temp = validParameter.validFile(parameters, "qthreshold", false); if (temp == "not found") { temp = "0"; } - convert(temp, qThreshold); + m->mothurConvert(temp, qThreshold); temp = validParameter.validFile(parameters, "qtrim", false); if (temp == "not found") { temp = "t"; } qtrim = m->isTrue(temp); @@ -221,9 +325,16 @@ TrimSeqsCommand::TrimSeqsCommand(string option) { temp = validParameter.validFile(parameters, "allfiles", false); if (temp == "not found") { temp = "F"; } allFiles = m->isTrue(temp); + + temp = validParameter.validFile(parameters, "keepforward", false); if (temp == "not found") { temp = "F"; } + keepforward = m->isTrue(temp); + + temp = validParameter.validFile(parameters, "checkorient", false); if (temp == "not found") { temp = "F"; } + reorient = m->isTrue(temp); - temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found") { temp = "1"; } - convert(temp, processors); + temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); } + m->setProcessors(temp); + m->mothurConvert(temp, processors); if(allFiles && (oligoFile == "")){ @@ -238,6 +349,13 @@ TrimSeqsCommand::TrimSeqsCommand(string option) { m->mothurOut("You didn't set any options... quiting command."); m->mothurOutEndLine(); abort = true; } + + if (countfile == "") { + if (nameFile == "") { + vector files; files.push_back(fastaFile); + parser.getNameFile(files); + } + } } } @@ -246,54 +364,6 @@ TrimSeqsCommand::TrimSeqsCommand(string option) { exit(1); } } - -//********************************************************************************************************************** - -void TrimSeqsCommand::help(){ - try { - m->mothurOut("The trim.seqs command reads a fastaFile and creates 2 new fasta files, .trim.fasta and scrap.fasta, as well as group files if you provide and oligos file.\n"); - m->mothurOut("The .trim.fasta contains sequences that meet your requirements, and the .scrap.fasta contains those which don't.\n"); - m->mothurOut("The trim.seqs command parameters are fasta, flip, oligos, maxambig, maxhomop, minlength, maxlength, qfile, qthreshold, qaverage, diffs, qtrim, keepfirst, removelast and allfiles.\n"); - m->mothurOut("The fasta parameter is required.\n"); - m->mothurOut("The flip parameter will output the reverse compliment of your trimmed sequence. The default is false.\n"); - m->mothurOut("The oligos parameter allows you to provide an oligos file.\n"); - m->mothurOut("The maxambig parameter allows you to set the maximum number of ambigious bases allowed. The default is -1.\n"); - m->mothurOut("The maxhomop parameter allows you to set a maximum homopolymer length. \n"); - m->mothurOut("The minlength parameter allows you to set and minimum sequence length. \n"); - m->mothurOut("The maxlength parameter allows you to set and maximum sequence length. \n"); - m->mothurOut("The tdiffs parameter is used to specify the total number of differences allowed in the sequence. The default is pdiffs + bdiffs.\n"); - m->mothurOut("The bdiffs parameter is used to specify the number of differences allowed in the barcode. The default is 0.\n"); - m->mothurOut("The pdiffs parameter is used to specify the number of differences allowed in the primer. The default is 0.\n"); - m->mothurOut("The qfile parameter allows you to provide a quality file.\n"); - m->mothurOut("The qthreshold parameter allows you to set a minimum quality score allowed. \n"); - m->mothurOut("The qaverage parameter allows you to set a minimum average quality score allowed. \n"); - m->mothurOut("The qwindowsize parameter allows you to set a number of bases in a window. Default=50.\n"); - m->mothurOut("The qwindowaverage parameter allows you to set a minimum average quality score allowed over a window. \n"); - m->mothurOut("The rollaverage parameter allows you to set a minimum rolling average quality score allowed over a window. \n"); - m->mothurOut("The qstepsize parameter allows you to set a number of bases to move the window over. Default=1.\n"); - m->mothurOut("The allfiles parameter will create separate group and fasta file for each grouping. The default is F.\n"); - m->mothurOut("The qtrim parameter will trim sequence from the point that they fall below the qthreshold and put it in the .trim file if set to true. The default is T.\n"); - m->mothurOut("The keepfirst parameter trims the sequence to the first keepfirst number of bases after the barcode or primers are removed, before the sequence is checked to see if it meets the other requirements. \n"); - m->mothurOut("The removelast removes the last removelast number of bases after the barcode or primers are removed, before the sequence is checked to see if it meets the other requirements.\n"); - m->mothurOut("The trim.seqs command should be in the following format: \n"); - m->mothurOut("trim.seqs(fasta=yourFastaFile, flip=yourFlip, oligos=yourOligos, maxambig=yourMaxambig, \n"); - m->mothurOut("maxhomop=yourMaxhomop, minlength=youMinlength, maxlength=yourMaxlength) \n"); - m->mothurOut("Example trim.seqs(fasta=abrecovery.fasta, flip=..., oligos=..., maxambig=..., maxhomop=..., minlength=..., maxlength=...).\n"); - m->mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n"); - m->mothurOut("For more details please check out the wiki http://www.mothur.org/wiki/Trim.seqs .\n\n"); - - } - catch(exception& e) { - m->errorOut(e, "TrimSeqsCommand", "help"); - exit(1); - } -} - - -//*************************************************************************************************************** - -TrimSeqsCommand::~TrimSeqsCommand(){ /* do nothing */ } - //*************************************************************************************************************** int TrimSeqsCommand::execute(){ @@ -301,19 +371,28 @@ int TrimSeqsCommand::execute(){ if (abort == true) { if (calledHelp) { return 0; } return 2; } + pairedOligos = false; numFPrimers = 0; //this needs to be initialized numRPrimers = 0; + numSpacers = 0; + numLinkers = 0; + createGroup = false; vector > fastaFileNames; vector > qualFileNames; + vector > nameFileNames; - string trimSeqFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "trim.fasta"; + map variables; + variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(fastaFile)); + variables["[tag]"] = "trim"; + string trimSeqFile = getOutputFileName("fasta",variables); + string trimQualFile = getOutputFileName("qfile",variables); outputNames.push_back(trimSeqFile); outputTypes["fasta"].push_back(trimSeqFile); - - string scrapSeqFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "scrap.fasta"; + + variables["[tag]"] = "scrap"; + string scrapSeqFile = getOutputFileName("fasta",variables); + string scrapQualFile = getOutputFileName("qfile",variables); outputNames.push_back(scrapSeqFile); outputTypes["fasta"].push_back(scrapSeqFile); - string trimQualFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "trim.qual"; - string scrapQualFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "scrap.qual"; if (qFileName != "") { outputNames.push_back(trimQualFile); outputNames.push_back(scrapQualFile); @@ -321,58 +400,201 @@ int TrimSeqsCommand::execute(){ outputTypes["qfile"].push_back(scrapQualFile); } - string outputGroupFileName; + variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(nameFile)); + variables["[tag]"] = "trim"; + string trimNameFile = getOutputFileName("name",variables); + variables["[tag]"] = "scrap"; + string scrapNameFile = getOutputFileName("name",variables); + + if (nameFile != "") { + m->readNames(nameFile, nameMap); + outputNames.push_back(trimNameFile); + outputNames.push_back(scrapNameFile); + outputTypes["name"].push_back(trimNameFile); + outputTypes["name"].push_back(scrapNameFile); + } + + variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(countfile)); + variables["[tag]"] = "trim"; + string trimCountFile = getOutputFileName("count",variables); + variables["[tag]"] = "scrap"; + string scrapCountFile = getOutputFileName("count",variables); + + if (countfile != "") { + CountTable ct; + ct.readTable(countfile, true); + nameCount = ct.getNameMap(); + outputNames.push_back(trimCountFile); + outputNames.push_back(scrapCountFile); + outputTypes["count"].push_back(trimCountFile); + outputTypes["count"].push_back(scrapCountFile); + } + + if (m->control_pressed) { return 0; } + + string outputGroupFileName; if(oligoFile != ""){ - outputGroupFileName = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "groups"; - outputNames.push_back(outputGroupFileName); outputTypes["group"].push_back(outputGroupFileName); - getOligos(fastaFileNames, qualFileNames); + createGroup = getOligos(fastaFileNames, qualFileNames, nameFileNames); + if ((createGroup) && (countfile == "")){ + map myvariables; + myvariables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(fastaFile)); + outputGroupFileName = getOutputFileName("group",myvariables); + outputNames.push_back(outputGroupFileName); outputTypes["group"].push_back(outputGroupFileName); + } } - - vector fastaFilePos; - vector qFilePos; + + if (!pairedOligos) { if (reorient) { m->mothurOut("[WARNING]: You cannot use reorient without paired barcodes or primers, skipping."); m->mothurOutEndLine(); reorient = false; } } + + if (m->control_pressed) { return 0; } + + //fills lines and qlines + setLines(fastaFile, qFileName); - setLines(fastaFile, qFileName, fastaFilePos, qFilePos); + if(processors == 1){ + driverCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, trimNameFile, scrapNameFile, trimCountFile, scrapCountFile, outputGroupFileName, fastaFileNames, qualFileNames, nameFileNames, lines[0], qLines[0]); + }else{ + createProcessesCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, trimNameFile, scrapNameFile, trimCountFile, scrapCountFile, outputGroupFileName, fastaFileNames, qualFileNames, nameFileNames); + } - for (int i = 0; i < (fastaFilePos.size()-1); i++) { - lines.push_back(new linePair(fastaFilePos[i], fastaFilePos[(i+1)])); - if (qFileName != "") { qLines.push_back(new linePair(qFilePos[i], qFilePos[(i+1)])); } - } - if(qFileName == "") { qLines = lines; } //files with duds - - #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) - if(processors == 1){ - driverCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, outputGroupFileName, fastaFileNames, qualFileNames, lines[0], qLines[0]); - }else{ - createProcessesCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, outputGroupFileName, fastaFileNames, qualFileNames); - } - #else - driverCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, outputGroupFileName, fastaFileNames, qualFileNames, lines[0], qLines[0]); - #endif if (m->control_pressed) { return 0; } - - + if(allFiles){ + map uniqueFastaNames;// so we don't add the same groupfile multiple times + map::iterator it; + set namesToRemove; for(int i=0;iisBlank(fastaFileNames[i][j])){ - remove(fastaFileNames[i][j].c_str()); - - if(qFileName != ""){ - remove(fastaFileNames[i][j].c_str()); + if (fastaFileNames[i][j] != "") { + if (namesToRemove.count(fastaFileNames[i][j]) == 0) { + if(m->isBlank(fastaFileNames[i][j])){ + m->mothurRemove(fastaFileNames[i][j]); + namesToRemove.insert(fastaFileNames[i][j]); + + if(qFileName != ""){ + m->mothurRemove(qualFileNames[i][j]); + namesToRemove.insert(qualFileNames[i][j]); + } + + if(nameFile != ""){ + m->mothurRemove(nameFileNames[i][j]); + namesToRemove.insert(nameFileNames[i][j]); + } + }else{ + it = uniqueFastaNames.find(fastaFileNames[i][j]); + if (it == uniqueFastaNames.end()) { + uniqueFastaNames[fastaFileNames[i][j]] = barcodeNameVector[i]; + } + } } - } } } + + //remove names for outputFileNames, just cleans up the output + vector outputNames2; + for(int i = 0; i < outputNames.size(); i++) { if (namesToRemove.count(outputNames[i]) == 0) { outputNames2.push_back(outputNames[i]); } } + outputNames = outputNames2; + + for (it = uniqueFastaNames.begin(); it != uniqueFastaNames.end(); it++) { + ifstream in; + m->openInputFile(it->first, in); + + ofstream out; + map myvariables; + myvariables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(it->first)); + string thisGroupName = ""; + if (countfile == "") { thisGroupName = getOutputFileName("group",myvariables); outputNames.push_back(thisGroupName); outputTypes["group"].push_back(thisGroupName); } + else { thisGroupName = getOutputFileName("count",myvariables); outputNames.push_back(thisGroupName); outputTypes["count"].push_back(thisGroupName); } + m->openOutputFile(thisGroupName, out); + + if (countfile != "") { out << "Representative_Sequence\ttotal\t" << it->second << endl; } + + while (!in.eof()){ + if (m->control_pressed) { break; } + + Sequence currSeq(in); m->gobble(in); + if (countfile == "") { + out << currSeq.getName() << '\t' << it->second << endl; + + if (nameFile != "") { + map::iterator itName = nameMap.find(currSeq.getName()); + if (itName != nameMap.end()) { + vector thisSeqsNames; + m->splitAtChar(itName->second, thisSeqsNames, ','); + for (int k = 1; k < thisSeqsNames.size(); k++) { //start at 1 to skip self + out << thisSeqsNames[k] << '\t' << it->second << endl; + } + }else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); } + } + }else { + map::iterator itTotalReps = nameCount.find(currSeq.getName()); + if (itTotalReps != nameCount.end()) { out << currSeq.getName() << '\t' << itTotalReps->second << '\t' << itTotalReps->second << endl; } + else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your count file, please correct."); m->mothurOutEndLine(); } + } + } + in.close(); + out.close(); + } + + if (countfile != "") { //create countfile with group info included + CountTable* ct = new CountTable(); + ct->readTable(trimCountFile, true); + map justTrimmedNames = ct->getNameMap(); + delete ct; + + CountTable newCt; + for (map::iterator itCount = groupCounts.begin(); itCount != groupCounts.end(); itCount++) { newCt.addGroup(itCount->first); } + vector tempCounts; tempCounts.resize(groupCounts.size(), 0); + for (map::iterator itNames = justTrimmedNames.begin(); itNames != justTrimmedNames.end(); itNames++) { + newCt.push_back(itNames->first, tempCounts); //add it to the table with no abundance so we can set the groups abundance + map::iterator it2 = groupMap.find(itNames->first); + if (it2 != groupMap.end()) { newCt.setAbund(itNames->first, it2->second, itNames->second); } + else { m->mothurOut("[ERROR]: missing group info for " + itNames->first + "."); m->mothurOutEndLine(); m->control_pressed = true; } + } + newCt.printTable(trimCountFile); + } + } + + if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } + + //output group counts + m->mothurOutEndLine(); + int total = 0; + if (groupCounts.size() != 0) { m->mothurOut("Group count: \n"); } + for (map::iterator it = groupCounts.begin(); it != groupCounts.end(); it++) { + total += it->second; m->mothurOut(it->first + "\t" + toString(it->second)); m->mothurOutEndLine(); } + if (total != 0) { m->mothurOut("Total of all groups is " + toString(total)); m->mothurOutEndLine(); } + if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } + + //set fasta file as new current fastafile + string current = ""; + itTypes = outputTypes.find("fasta"); + if (itTypes != outputTypes.end()) { + if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFastaFile(current); } + } + + itTypes = outputTypes.find("name"); + if (itTypes != outputTypes.end()) { + if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setNameFile(current); } + } + itTypes = outputTypes.find("qfile"); + if (itTypes != outputTypes.end()) { + if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setQualFile(current); } + } - if (m->control_pressed) { - for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } - return 0; + itTypes = outputTypes.find("group"); + if (itTypes != outputTypes.end()) { + if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setGroupFile(current); } + } + + itTypes = outputTypes.find("count"); + if (itTypes != outputTypes.end()) { + if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setCountTableFile(current); } } m->mothurOutEndLine(); @@ -390,8 +612,7 @@ int TrimSeqsCommand::execute(){ } /**************************************************************************************/ - -int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string trimFileName, string scrapFileName, string trimQFileName, string scrapQFileName, string groupFileName, vector > fastaFileNames, vector > qualFileNames, linePair* line, linePair* qline) { +int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string trimFileName, string scrapFileName, string trimQFileName, string scrapQFileName, string trimNFileName, string scrapNFileName, string trimCFileName, string scrapCFileName, string groupFileName, vector > fastaFileNames, vector > qualFileNames, vector > nameFileNames, linePair line, linePair qline) { try { @@ -408,16 +629,36 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string m->openOutputFile(scrapQFileName, scrapQualFile); } - ofstream outGroupsFile; - if (oligoFile != ""){ m->openOutputFile(groupFileName, outGroupsFile); } + ofstream trimNameFile; + ofstream scrapNameFile; + if(nameFile != ""){ + m->openOutputFile(trimNFileName, trimNameFile); + m->openOutputFile(scrapNFileName, scrapNameFile); + } + ofstream trimCountFile; + ofstream scrapCountFile; + if(countfile != ""){ + m->openOutputFile(trimCFileName, trimCountFile); + m->openOutputFile(scrapCFileName, scrapCountFile); + if (line.start == 0) { trimCountFile << "Representative_Sequence\ttotal" << endl; scrapCountFile << "Representative_Sequence\ttotal" << endl; } + } + + ofstream outGroupsFile; + if ((createGroup) && (countfile == "")){ m->openOutputFile(groupFileName, outGroupsFile); } if(allFiles){ for (int i = 0; i < fastaFileNames.size(); i++) { //clears old file for (int j = 0; j < fastaFileNames[i].size(); j++) { //clears old file - ofstream temp; - m->openOutputFile(fastaFileNames[i][j], temp); temp.close(); - if(qFileName != ""){ - m->openOutputFile(qualFileNames[i][j], temp); temp.close(); + if (fastaFileNames[i][j] != "") { + ofstream temp; + m->openOutputFile(fastaFileNames[i][j], temp); temp.close(); + if(qFileName != ""){ + m->openOutputFile(qualFileNames[i][j], temp); temp.close(); + } + + if(nameFile != ""){ + m->openOutputFile(nameFileNames[i][j], temp); temp.close(); + } } } } @@ -425,29 +666,48 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string ifstream inFASTA; m->openInputFile(filename, inFASTA); - inFASTA.seekg(line->start); + inFASTA.seekg(line.start); ifstream qFile; if(qFileName != "") { m->openInputFile(qFileName, qFile); - qFile.seekg(qline->start); + qFile.seekg(qline.start); } int count = 0; bool moreSeqs = 1; - + int numBarcodes = barcodes.size(); + TrimOligos* trimOligos = NULL; + if (pairedOligos) { trimOligos = new TrimOligos(pdiffs, bdiffs, 0, 0, pairedPrimers, pairedBarcodes); numBarcodes = pairedBarcodes.size(); } + else { trimOligos = new TrimOligos(pdiffs, bdiffs, ldiffs, sdiffs, primers, barcodes, revPrimer, linker, spacer); } + + TrimOligos* rtrimOligos = NULL; + if (reorient) { + //create reoriented primer and barcode pairs + map rpairedPrimers, rpairedBarcodes; + for (map::iterator it = pairedPrimers.begin(); it != pairedPrimers.end(); it++) { + oligosPair tempPair(reverseOligo((it->second).reverse), (reverseOligo((it->second).forward))); //reversePrimer, rc ForwardPrimer + rpairedPrimers[it->first] = tempPair; + //cout << reverseOligo((it->second).reverse) << '\t' << (reverseOligo((it->second).forward)) << '\t' << primerNameVector[it->first] << endl; + } + for (map::iterator it = pairedBarcodes.begin(); it != pairedBarcodes.end(); it++) { + oligosPair tempPair(reverseOligo((it->second).reverse), (reverseOligo((it->second).forward))); //reverseBarcode, rc ForwardBarcode + rpairedBarcodes[it->first] = tempPair; + //cout << reverseOligo((it->second).reverse) << '\t' << (reverseOligo((it->second).forward)) << '\t' << barcodeNameVector[it->first] << endl; + } + rtrimOligos = new TrimOligos(pdiffs, bdiffs, 0, 0, rpairedPrimers, rpairedBarcodes); numBarcodes = rpairedBarcodes.size(); + } + while (moreSeqs) { - if (m->control_pressed) { + if (m->control_pressed) { + delete trimOligos; if (reorient) { delete rtrimOligos; } inFASTA.close(); trimFASTAFile.close(); scrapFASTAFile.close(); - if (oligoFile != "") { outGroupsFile.close(); } - - if(qFileName != ""){ - qFile.close(); - } - for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } - - return 0; + if ((createGroup) && (countfile == "")) { outGroupsFile.close(); } + if(qFileName != "") { qFile.close(); scrapQualFile.close(); trimQualFile.close(); } + if(nameFile != "") { scrapNameFile.close(); trimNameFile.close(); } + if(countfile != "") { scrapCountFile.close(); trimCountFile.close(); } + for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } int success = 1; @@ -455,37 +715,96 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string int currentSeqsDiffs = 0; Sequence currSeq(inFASTA); m->gobble(inFASTA); - - QualityScores currQual; + //cout << currSeq.getName() << '\t' << currSeq.getUnaligned().length() << endl; + Sequence savedSeq(currSeq.getName(), currSeq.getAligned()); + + QualityScores currQual; QualityScores savedQual; if(qFileName != ""){ currQual = QualityScores(qFile); m->gobble(qFile); + savedQual.setName(currQual.getName()); savedQual.setScores(currQual.getScores()); + //cout << currQual.getName() << endl; } - + string origSeq = currSeq.getUnaligned(); if (origSeq != "") { int barcodeIndex = 0; int primerIndex = 0; - if(barcodes.size() != 0){ - success = stripBarcode(currSeq, currQual, barcodeIndex); - if(success > bdiffs) { trashCode += 'b'; } + if(numLinkers != 0){ + success = trimOligos->stripLinker(currSeq, currQual); + if(success > ldiffs) { trashCode += 'k'; } + else{ currentSeqsDiffs += success; } + + } + + if(numBarcodes != 0){ + success = trimOligos->stripBarcode(currSeq, currQual, barcodeIndex); + if(success > bdiffs) { + trashCode += 'b'; + } else{ currentSeqsDiffs += success; } } + if(numSpacers != 0){ + success = trimOligos->stripSpacer(currSeq, currQual); + if(success > sdiffs) { trashCode += 's'; } + else{ currentSeqsDiffs += success; } + + } + if(numFPrimers != 0){ - success = stripForward(currSeq, currQual, primerIndex); - if(success > pdiffs) { trashCode += 'f'; } + success = trimOligos->stripForward(currSeq, currQual, primerIndex, keepforward); + if(success > pdiffs) { + trashCode += 'f'; + } else{ currentSeqsDiffs += success; } } if (currentSeqsDiffs > tdiffs) { trashCode += 't'; } if(numRPrimers != 0){ - success = stripReverse(currSeq, currQual); + success = trimOligos->stripReverse(currSeq, currQual); if(!success) { trashCode += 'r'; } } - + + if (reorient && (trashCode != "")) { //if you failed and want to check the reverse + int thisSuccess = 0; + string thisTrashCode = ""; + int thisCurrentSeqsDiffs = 0; + + int thisBarcodeIndex = 0; + int thisPrimerIndex = 0; + + if(numBarcodes != 0){ + thisSuccess = rtrimOligos->stripBarcode(savedSeq, savedQual, thisBarcodeIndex); + if(thisSuccess > bdiffs) { thisTrashCode += "b"; } + else{ thisCurrentSeqsDiffs += thisSuccess; } + } + + if(numFPrimers != 0){ + thisSuccess = rtrimOligos->stripForward(savedSeq, savedQual, thisPrimerIndex, keepforward); + if(thisSuccess > pdiffs) { thisTrashCode += "f"; } + else{ thisCurrentSeqsDiffs += thisSuccess; } + } + + if (thisCurrentSeqsDiffs > tdiffs) { thisTrashCode += 't'; } + + if (thisTrashCode == "") { + trashCode = thisTrashCode; + success = thisSuccess; + currentSeqsDiffs = thisCurrentSeqsDiffs; + barcodeIndex = thisBarcodeIndex; + primerIndex = thisPrimerIndex; + savedSeq.reverseComplement(); + currSeq.setAligned(savedSeq.getAligned()); + if(qFileName != ""){ + savedQual.flipQScores(); + currQual.setScores(savedQual.getScores()); + } + }else { trashCode += "(" + thisTrashCode + ")"; } + } + if(keepFirst != 0){ success = keepFirstTrim(currSeq, currQual); } @@ -531,33 +850,113 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string } } + if (m->debug) { m->mothurOut("[DEBUG]: " + currSeq.getName() + ", trashcode= " + trashCode); if (trashCode.length() != 0) { m->mothurOutEndLine(); } } + if(trashCode.length() == 0){ - currSeq.setAligned(currSeq.getUnaligned()); - currSeq.printSequence(trimFASTAFile); - - if(qFileName != ""){ - currQual.printQScores(trimQualFile); - } - - if(barcodes.size() != 0){ - outGroupsFile << currSeq.getName() << '\t' << barcodeNameVector[barcodeIndex] << endl; - } - - - if(allFiles){ - ofstream output; - m->openOutputFileAppend(fastaFileNames[barcodeIndex][primerIndex], output); - currSeq.printSequence(output); - output.close(); - - if(qFileName != ""){ - m->openOutputFileAppend(qualFileNames[barcodeIndex][primerIndex], output); - currQual.printQScores(output); - output.close(); - } - } + string thisGroup = ""; + if (createGroup) { + if(numBarcodes != 0){ + thisGroup = barcodeNameVector[barcodeIndex]; + if (numFPrimers != 0) { + if (primerNameVector[primerIndex] != "") { + if(thisGroup != "") { + thisGroup += "." + primerNameVector[primerIndex]; + }else { + thisGroup = primerNameVector[primerIndex]; + } + } + } + } + } + + int pos = thisGroup.find("ignore"); + if (pos == string::npos) { + currSeq.setAligned(currSeq.getUnaligned()); + currSeq.printSequence(trimFASTAFile); + + if(qFileName != ""){ + currQual.printQScores(trimQualFile); + } + + + if(nameFile != ""){ + map::iterator itName = nameMap.find(currSeq.getName()); + if (itName != nameMap.end()) { trimNameFile << itName->first << '\t' << itName->second << endl; } + else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); } + } + + int numRedundants = 0; + if (countfile != "") { + map::iterator itCount = nameCount.find(currSeq.getName()); + if (itCount != nameCount.end()) { + trimCountFile << itCount->first << '\t' << itCount->second << endl; + numRedundants = itCount->second-1; + }else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your count file, please correct."); m->mothurOutEndLine(); } + } + + if (createGroup) { + if(numBarcodes != 0){ + + if (m->debug) { m->mothurOut(", group= " + thisGroup + "\n"); } + + if (countfile == "") { outGroupsFile << currSeq.getName() << '\t' << thisGroup << endl; } + else { groupMap[currSeq.getName()] = thisGroup; } + + if (nameFile != "") { + map::iterator itName = nameMap.find(currSeq.getName()); + if (itName != nameMap.end()) { + vector thisSeqsNames; + m->splitAtChar(itName->second, thisSeqsNames, ','); + numRedundants = thisSeqsNames.size()-1; //we already include ourselves below + for (int k = 1; k < thisSeqsNames.size(); k++) { //start at 1 to skip self + outGroupsFile << thisSeqsNames[k] << '\t' << thisGroup << endl; + } + }else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); } + } + + map::iterator it = groupCounts.find(thisGroup); + if (it == groupCounts.end()) { groupCounts[thisGroup] = 1 + numRedundants; } + else { groupCounts[it->first] += (1 + numRedundants); } + + } + } + + if(allFiles){ + ofstream output; + m->openOutputFileAppend(fastaFileNames[barcodeIndex][primerIndex], output); + currSeq.printSequence(output); + output.close(); + + if(qFileName != ""){ + m->openOutputFileAppend(qualFileNames[barcodeIndex][primerIndex], output); + currQual.printQScores(output); + output.close(); + } + + if(nameFile != ""){ + map::iterator itName = nameMap.find(currSeq.getName()); + if (itName != nameMap.end()) { + m->openOutputFileAppend(nameFileNames[barcodeIndex][primerIndex], output); + output << itName->first << '\t' << itName->second << endl; + output.close(); + }else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); } + } + } + } } else{ + if(nameFile != ""){ //needs to be before the currSeq name is changed + map::iterator itName = nameMap.find(currSeq.getName()); + if (itName != nameMap.end()) { scrapNameFile << itName->first << '\t' << itName->second << endl; } + else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); } + } + if (countfile != "") { + map::iterator itCount = nameCount.find(currSeq.getName()); + if (itCount != nameCount.end()) { + trimCountFile << itCount->first << '\t' << itCount->second << endl; + }else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your count file, please correct."); m->mothurOutEndLine(); } + } + currSeq.setName(currSeq.getName() + '|' + trashCode); currSeq.setUnaligned(origSeq); currSeq.setAligned(origSeq); @@ -569,26 +968,30 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string count++; } - #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) - unsigned long int pos = inFASTA.tellg(); - if ((pos == -1) || (pos >= line->end)) { break; } + #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) + unsigned long long pos = inFASTA.tellg(); + if ((pos == -1) || (pos >= line.end)) { break; } + #else if (inFASTA.eof()) { break; } #endif - + //report progress if((count) % 1000 == 0){ m->mothurOut(toString(count)); m->mothurOutEndLine(); } } //report progress if((count) % 1000 != 0){ m->mothurOut(toString(count)); m->mothurOutEndLine(); } - + delete trimOligos; + if (reorient) { delete rtrimOligos; } inFASTA.close(); trimFASTAFile.close(); scrapFASTAFile.close(); - if (oligoFile != "") { outGroupsFile.close(); } + if (createGroup) { outGroupsFile.close(); } if(qFileName != "") { qFile.close(); scrapQualFile.close(); trimQualFile.close(); } + if(nameFile != "") { scrapNameFile.close(); trimNameFile.close(); } + if(countfile != "") { scrapCountFile.close(); trimCountFile.close(); } return count; } @@ -600,14 +1003,15 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string /**************************************************************************************************/ -int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName, string trimFASTAFileName, string scrapFASTAFileName, string trimQualFileName, string scrapQualFileName, string groupFile, vector > fastaFileNames, vector > qualFileNames) { +int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName, string trimFASTAFileName, string scrapFASTAFileName, string trimQualFileName, string scrapQualFileName, string trimNameFileName, string scrapNameFileName, string trimCountFileName, string scrapCountFileName, string groupFile, vector > fastaFileNames, vector > qualFileNames, vector > nameFileNames) { try { -#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) - int process = 1; + + int process = 1; int exitCommand = 1; processIDS.clear(); - //loop through and create all the processes you want +#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) + //loop through and create all the processes you want while (process != processors) { int pid = fork(); @@ -618,18 +1022,25 @@ int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName vector > tempFASTAFileNames = fastaFileNames; vector > tempPrimerQualFileNames = qualFileNames; + vector > tempNameFileNames = nameFileNames; if(allFiles){ ofstream temp; for(int i=0;iopenOutputFile(tempFASTAFileNames[i][j], temp); temp.close(); - - if(qFileName != ""){ - tempPrimerQualFileNames[i][j] += toString(getpid()) + ".temp"; - m->openOutputFile(tempPrimerQualFileNames[i][j], temp); temp.close(); + if (tempFASTAFileNames[i][j] != "") { + tempFASTAFileNames[i][j] += toString(getpid()) + ".temp"; + m->openOutputFile(tempFASTAFileNames[i][j], temp); temp.close(); + + if(qFileName != ""){ + tempPrimerQualFileNames[i][j] += toString(getpid()) + ".temp"; + m->openOutputFile(tempPrimerQualFileNames[i][j], temp); temp.close(); + } + if(nameFile != ""){ + tempNameFileNames[i][j] += toString(getpid()) + ".temp"; + m->openOutputFile(tempNameFileNames[i][j], temp); temp.close(); + } } } } @@ -641,12 +1052,37 @@ int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName (scrapFASTAFileName + toString(getpid()) + ".temp"), (trimQualFileName + toString(getpid()) + ".temp"), (scrapQualFileName + toString(getpid()) + ".temp"), + (trimNameFileName + toString(getpid()) + ".temp"), + (scrapNameFileName + toString(getpid()) + ".temp"), + (trimCountFileName + toString(getpid()) + ".temp"), + (scrapCountFileName + toString(getpid()) + ".temp"), (groupFile + toString(getpid()) + ".temp"), tempFASTAFileNames, tempPrimerQualFileNames, + tempNameFileNames, lines[process], qLines[process]); + + if (m->debug) { m->mothurOut("[DEBUG]: " + toString(lines[process].start) + '\t' + toString(qLines[process].start) + '\t' + toString(getpid()) + '\n'); } + //pass groupCounts to parent + if(createGroup){ + ofstream out; + string tempFile = filename + toString(getpid()) + ".num.temp"; + m->openOutputFile(tempFile, out); + + out << groupCounts.size() << endl; + + for (map::iterator it = groupCounts.begin(); it != groupCounts.end(); it++) { + out << it->first << '\t' << it->second << endl; + } + + out << groupMap.size() << endl; + for (map::iterator it = groupMap.begin(); it != groupMap.end(); it++) { + out << it->first << '\t' << it->second << endl; + } + out.close(); + } exit(0); }else { m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine(); @@ -659,59 +1095,254 @@ int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName ofstream temp; m->openOutputFile(trimFASTAFileName, temp); temp.close(); m->openOutputFile(scrapFASTAFileName, temp); temp.close(); - m->openOutputFile(trimQualFileName, temp); temp.close(); - m->openOutputFile(scrapQualFileName, temp); temp.close(); + if(qFileName != ""){ + m->openOutputFile(trimQualFileName, temp); temp.close(); + m->openOutputFile(scrapQualFileName, temp); temp.close(); + } + if (nameFile != "") { + m->openOutputFile(trimNameFileName, temp); temp.close(); + m->openOutputFile(scrapNameFileName, temp); temp.close(); + } + if (countfile != "") { + m->openOutputFile(trimCountFileName, temp); temp.close(); + m->openOutputFile(scrapCountFileName, temp); temp.close(); + } - - - driverCreateTrim(filename, qFileName, trimFASTAFileName, scrapFASTAFileName, trimQualFileName, scrapQualFileName, groupFile, fastaFileNames, qualFileNames, lines[0], qLines[0]); - + driverCreateTrim(filename, qFileName, trimFASTAFileName, scrapFASTAFileName, trimQualFileName, scrapQualFileName, trimNameFileName, scrapNameFileName, trimCountFileName, scrapCountFileName, groupFile, fastaFileNames, qualFileNames, nameFileNames, lines[0], qLines[0]); //force parent to wait until all the processes are done for (int i=0;i pDataArray; + DWORD dwThreadIdArray[processors-1]; + HANDLE hThreadArray[processors-1]; + + //Create processor worker threads. + for( int h=0; h > tempFASTAFileNames = fastaFileNames; + vector > tempPrimerQualFileNames = qualFileNames; + vector > tempNameFileNames = nameFileNames; + + if(allFiles){ + ofstream temp; + + for(int i=0;iopenOutputFile(tempFASTAFileNames[i][j], temp); temp.close(); + + if(qFileName != ""){ + tempPrimerQualFileNames[i][j] += extension; + m->openOutputFile(tempPrimerQualFileNames[i][j], temp); temp.close(); + } + if(nameFile != ""){ + tempNameFileNames[i][j] += extension; + m->openOutputFile(tempNameFileNames[i][j], temp); temp.close(); + } + } + } + } + } + + + trimData* tempTrim = new trimData(filename, + qFileName, nameFile, countfile, + (trimFASTAFileName+extension), + (scrapFASTAFileName+extension), + (trimQualFileName+extension), + (scrapQualFileName+extension), + (trimNameFileName+extension), + (scrapNameFileName+extension), + (trimCountFileName+extension), + (scrapCountFileName+extension), + (groupFile+extension), + tempFASTAFileNames, + tempPrimerQualFileNames, + tempNameFileNames, + lines[h].start, lines[h].end, qLines[h].start, qLines[h].end, m, + pdiffs, bdiffs, ldiffs, sdiffs, tdiffs, primers, barcodes, revPrimer, linker, spacer, pairedBarcodes, pairedPrimers, pairedOligos, + primerNameVector, barcodeNameVector, createGroup, allFiles, keepforward, keepFirst, removeLast, + qWindowStep, qWindowSize, qWindowAverage, qtrim, qThreshold, qAverage, qRollAverage, + minLength, maxAmbig, maxHomoP, maxLength, flip, reorient, nameMap, nameCount); + pDataArray.push_back(tempTrim); + + hThreadArray[h] = CreateThread(NULL, 0, MyTrimThreadFunction, pDataArray[h], 0, &dwThreadIdArray[h]); + } + + //parent do my part + ofstream temp; + m->openOutputFile(trimFASTAFileName, temp); temp.close(); + m->openOutputFile(scrapFASTAFileName, temp); temp.close(); + if(qFileName != ""){ + m->openOutputFile(trimQualFileName, temp); temp.close(); + m->openOutputFile(scrapQualFileName, temp); temp.close(); + } + if (nameFile != "") { + m->openOutputFile(trimNameFileName, temp); temp.close(); + m->openOutputFile(scrapNameFileName, temp); temp.close(); + } + vector > tempFASTAFileNames = fastaFileNames; + vector > tempPrimerQualFileNames = qualFileNames; + vector > tempNameFileNames = nameFileNames; + if(allFiles){ + ofstream temp; + string extension = toString(processors-1) + ".temp"; + for(int i=0;iopenOutputFile(tempFASTAFileNames[i][j], temp); temp.close(); + + if(qFileName != ""){ + tempPrimerQualFileNames[i][j] += extension; + m->openOutputFile(tempPrimerQualFileNames[i][j], temp); temp.close(); + } + if(nameFile != ""){ + tempNameFileNames[i][j] += extension; + m->openOutputFile(tempNameFileNames[i][j], temp); temp.close(); + } + } + } + } + } + + driverCreateTrim(filename, qFileName, (trimFASTAFileName + toString(processors-1) + ".temp"), (scrapFASTAFileName + toString(processors-1) + ".temp"), (trimQualFileName + toString(processors-1) + ".temp"), (scrapQualFileName + toString(processors-1) + ".temp"), (trimNameFileName + toString(processors-1) + ".temp"), (scrapNameFileName + toString(processors-1) + ".temp"), (trimCountFileName + toString(processors-1) + ".temp"), (scrapCountFileName + toString(processors-1) + ".temp"), (groupFile + toString(processors-1) + ".temp"), tempFASTAFileNames, tempPrimerQualFileNames, tempNameFileNames, lines[processors-1], qLines[processors-1]); + processIDS.push_back(processors-1); + + + //Wait until all threads have terminated. + WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE); + + //Close all thread handles and free memory allocations. + for(int i=0; i < pDataArray.size(); i++){ + if (pDataArray[i]->count != pDataArray[i]->lineEnd) { + m->mothurOut("[ERROR]: process " + toString(i) + " only processed " + toString(pDataArray[i]->count) + " of " + toString(pDataArray[i]->lineEnd) + " sequences assigned to it, quitting. \n"); m->control_pressed = true; + } + for (map::iterator it = pDataArray[i]->groupCounts.begin(); it != pDataArray[i]->groupCounts.end(); it++) { + map::iterator it2 = groupCounts.find(it->first); + if (it2 == groupCounts.end()) { groupCounts[it->first] = it->second; } + else { groupCounts[it->first] += it->second; } + } + for (map::iterator it = pDataArray[i]->groupMap.begin(); it != pDataArray[i]->groupMap.end(); it++) { + map::iterator it2 = groupMap.find(it->first); + if (it2 == groupMap.end()) { groupMap[it->first] = it->second; } + else { m->mothurOut("[ERROR]: " + it->first + " is in your fasta file more than once. Sequence names must be unique. please correct.\n"); } + } + CloseHandle(hThreadArray[i]); + delete pDataArray[i]; + } + +#endif + + + //append files for(int i=0;imothurOut("Appending files from process " + toString(processIDS[i])); m->mothurOutEndLine(); m->appendFiles((trimFASTAFileName + toString(processIDS[i]) + ".temp"), trimFASTAFileName); - remove((trimFASTAFileName + toString(processIDS[i]) + ".temp").c_str()); + m->mothurRemove((trimFASTAFileName + toString(processIDS[i]) + ".temp")); m->appendFiles((scrapFASTAFileName + toString(processIDS[i]) + ".temp"), scrapFASTAFileName); - remove((scrapFASTAFileName + toString(processIDS[i]) + ".temp").c_str()); + m->mothurRemove((scrapFASTAFileName + toString(processIDS[i]) + ".temp")); if(qFileName != ""){ m->appendFiles((trimQualFileName + toString(processIDS[i]) + ".temp"), trimQualFileName); - remove((trimQualFileName + toString(processIDS[i]) + ".temp").c_str()); + m->mothurRemove((trimQualFileName + toString(processIDS[i]) + ".temp")); m->appendFiles((scrapQualFileName + toString(processIDS[i]) + ".temp"), scrapQualFileName); - remove((scrapQualFileName + toString(processIDS[i]) + ".temp").c_str()); + m->mothurRemove((scrapQualFileName + toString(processIDS[i]) + ".temp")); } - m->appendFiles((groupFile + toString(processIDS[i]) + ".temp"), groupFile); - remove((groupFile + toString(processIDS[i]) + ".temp").c_str()); + if(nameFile != ""){ + m->appendFiles((trimNameFileName + toString(processIDS[i]) + ".temp"), trimNameFileName); + m->mothurRemove((trimNameFileName + toString(processIDS[i]) + ".temp")); + m->appendFiles((scrapNameFileName + toString(processIDS[i]) + ".temp"), scrapNameFileName); + m->mothurRemove((scrapNameFileName + toString(processIDS[i]) + ".temp")); + } + + if(countfile != ""){ + m->appendFiles((trimCountFileName + toString(processIDS[i]) + ".temp"), trimCountFileName); + m->mothurRemove((trimCountFileName + toString(processIDS[i]) + ".temp")); + m->appendFiles((scrapCountFileName + toString(processIDS[i]) + ".temp"), scrapCountFileName); + m->mothurRemove((scrapCountFileName + toString(processIDS[i]) + ".temp")); + } + + if((createGroup)&&(countfile == "")){ + m->appendFiles((groupFile + toString(processIDS[i]) + ".temp"), groupFile); + m->mothurRemove((groupFile + toString(processIDS[i]) + ".temp")); + } if(allFiles){ for(int j=0;jappendFiles((fastaFileNames[j][k] + toString(processIDS[i]) + ".temp"), fastaFileNames[j][k]); - remove((fastaFileNames[j][k] + toString(processIDS[i]) + ".temp").c_str()); - - if(qFileName != ""){ - m->appendFiles((qualFileNames[j][k] + toString(processIDS[i]) + ".temp"), qualFileNames[j][k]); - remove((qualFileNames[j][k] + toString(processIDS[i]) + ".temp").c_str()); + if (fastaFileNames[j][k] != "") { + m->appendFiles((fastaFileNames[j][k] + toString(processIDS[i]) + ".temp"), fastaFileNames[j][k]); + m->mothurRemove((fastaFileNames[j][k] + toString(processIDS[i]) + ".temp")); + + if(qFileName != ""){ + m->appendFiles((qualFileNames[j][k] + toString(processIDS[i]) + ".temp"), qualFileNames[j][k]); + m->mothurRemove((qualFileNames[j][k] + toString(processIDS[i]) + ".temp")); + } + + if(nameFile != ""){ + m->appendFiles((nameFileNames[j][k] + toString(processIDS[i]) + ".temp"), nameFileNames[j][k]); + m->mothurRemove((nameFileNames[j][k] + toString(processIDS[i]) + ".temp")); + } } } } } + #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) + if(createGroup){ + ifstream in; + string tempFile = filename + toString(processIDS[i]) + ".num.temp"; + m->openInputFile(tempFile, in); + int tempNum; + string group; + + in >> tempNum; m->gobble(in); + + if (tempNum != 0) { + for (int i = 0; i < tempNum; i++) { + int groupNum; + in >> group >> groupNum; m->gobble(in); + + map::iterator it = groupCounts.find(group); + if (it == groupCounts.end()) { groupCounts[group] = groupNum; } + else { groupCounts[it->first] += groupNum; } + } + } + in >> tempNum; m->gobble(in); + if (tempNum != 0) { + for (int i = 0; i < tempNum; i++) { + string group, seqName; + in >> seqName >> group; m->gobble(in); + + map::iterator it = groupMap.find(seqName); + if (it == groupMap.end()) { groupMap[seqName] = group; } + else { m->mothurOut("[ERROR]: " + seqName + " is in your fasta file more than once. Sequence names must be unique. please correct.\n"); } + } + } + + in.close(); m->mothurRemove(tempFile); + } + #endif } - - return exitCommand; -#endif + + return exitCommand; } catch(exception& e) { m->errorOut(e, "TrimSeqsCommand", "createProcessesCreateTrim"); @@ -721,14 +1352,16 @@ int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName /**************************************************************************************************/ -int TrimSeqsCommand::setLines(string filename, string qfilename, vector& fastaFilePos, vector& qfileFilePos) { +int TrimSeqsCommand::setLines(string filename, string qfilename) { try { + + vector fastaFilePos; + vector qfileFilePos; + #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) //set file positions for fasta file fastaFilePos = m->divideFile(filename, processors); - if (qfilename == "") { return processors; } - //get name of first sequence in each chunk map firstSeqNames; for (int i = 0; i < (fastaFilePos.size()-1); i++) { @@ -741,61 +1374,107 @@ int TrimSeqsCommand::setLines(string filename, string qfilename, vectoropenInputFile(qfilename, inQual); - - string input; - while(!inQual.eof()){ - input = m->getline(inQual); - - if (input.length() != 0) { - if(input[0] == '>'){ //this is a sequence name line - istringstream nameStream(input); - - string sname = ""; nameStream >> sname; - sname = sname.substr(1); - - map::iterator it = firstSeqNames.find(sname); - - if(it != firstSeqNames.end()) { //this is the start of a new chunk - unsigned long int pos = inQual.tellg(); - qfileFilePos.push_back(pos - input.length() - 1); - firstSeqNames.erase(it); - } - } - } - - if (firstSeqNames.size() == 0) { break; } - } - inQual.close(); + if(qfilename != "") { + //seach for filePos of each first name in the qfile and save in qfileFilePos + ifstream inQual; + m->openInputFile(qfilename, inQual); + + string input; + while(!inQual.eof()){ + input = m->getline(inQual); + + if (input.length() != 0) { + if(input[0] == '>'){ //this is a sequence name line + istringstream nameStream(input); + + string sname = ""; nameStream >> sname; + sname = sname.substr(1); + + m->checkName(sname); + + map::iterator it = firstSeqNames.find(sname); + + if(it != firstSeqNames.end()) { //this is the start of a new chunk + unsigned long long pos = inQual.tellg(); + qfileFilePos.push_back(pos - input.length() - 1); + firstSeqNames.erase(it); + } + } + } + + if (firstSeqNames.size() == 0) { break; } + } + inQual.close(); + + + if (firstSeqNames.size() != 0) { + for (map::iterator it = firstSeqNames.begin(); it != firstSeqNames.end(); it++) { + m->mothurOut(it->first + " is in your fasta file and not in your quality file, not using quality file."); m->mothurOutEndLine(); + } + qFileName = ""; + return processors; + } + + //get last file position of qfile + FILE * pFile; + unsigned long long size; + + //get num bytes in file + pFile = fopen (qfilename.c_str(),"rb"); + if (pFile==NULL) perror ("Error opening file"); + else{ + fseek (pFile, 0, SEEK_END); + size=ftell (pFile); + fclose (pFile); + } + + qfileFilePos.push_back(size); + } + + for (int i = 0; i < (fastaFilePos.size()-1); i++) { + if (m->debug) { m->mothurOut("[DEBUG]: " + toString(i) +'\t' + toString(fastaFilePos[i]) + '\t' + toString(fastaFilePos[i+1]) + '\n'); } + lines.push_back(linePair(fastaFilePos[i], fastaFilePos[(i+1)])); + if (qfilename != "") { qLines.push_back(linePair(qfileFilePos[i], qfileFilePos[(i+1)])); } + } + if(qfilename == "") { qLines = lines; } //files with duds - if (firstSeqNames.size() != 0) { - for (map::iterator it = firstSeqNames.begin(); it != firstSeqNames.end(); it++) { - m->mothurOut(it->first + " is in your fasta file and not in your quality file, not using quality file."); m->mothurOutEndLine(); - } - qFileName = ""; - return processors; - } - - //get last file position of qfile - FILE * pFile; - unsigned long int size; - - //get num bytes in file - pFile = fopen (qfilename.c_str(),"rb"); - if (pFile==NULL) perror ("Error opening file"); - else{ - fseek (pFile, 0, SEEK_END); - size=ftell (pFile); - fclose (pFile); - } + return processors; - qfileFilePos.push_back(size); + #else + + if (processors == 1) { //save time + //fastaFilePos.push_back(0); qfileFilePos.push_back(0); + //fastaFilePos.push_back(1000); qfileFilePos.push_back(1000); + lines.push_back(linePair(0, 1000)); + if (qfilename != "") { qLines.push_back(linePair(0, 1000)); } + }else{ + int numFastaSeqs = 0; + fastaFilePos = m->setFilePosFasta(filename, numFastaSeqs); + if (fastaFilePos.size() < processors) { processors = fastaFilePos.size(); } + + if (qfilename != "") { + int numQualSeqs = 0; + qfileFilePos = m->setFilePosFasta(qfilename, numQualSeqs); + + if (numFastaSeqs != numQualSeqs) { + m->mothurOut("[ERROR]: You have " + toString(numFastaSeqs) + " sequences in your fasta file, but " + toString(numQualSeqs) + " sequences in your quality file."); m->mothurOutEndLine(); m->control_pressed = true; + } + } + + //figure out how many sequences you have to process + int numSeqsPerProcessor = numFastaSeqs / processors; + for (int i = 0; i < processors; i++) { + int startIndex = i * numSeqsPerProcessor; + if(i == (processors - 1)){ numSeqsPerProcessor = numFastaSeqs - i * numSeqsPerProcessor; } + lines.push_back(linePair(fastaFilePos[startIndex], numSeqsPerProcessor)); + if (qfilename != "") { qLines.push_back(linePair(qfileFilePos[startIndex], numSeqsPerProcessor)); } + } + } + if(qfilename == "") { qLines = lines; } //files with duds + return 1; - return processors; + #endif } catch(exception& e) { m->errorOut(e, "TrimSeqsCommand", "setLines"); @@ -805,30 +1484,41 @@ int TrimSeqsCommand::setLines(string filename, string qfilename, vector >& fastaFileNames, vector >& qualFileNames){ +bool TrimSeqsCommand::getOligos(vector >& fastaFileNames, vector >& qualFileNames, vector >& nameFileNames){ try { ifstream inOligos; m->openInputFile(oligoFile, inOligos); ofstream test; - string type, oligo, group; + string type, oligo, roligo, group; + bool hasPrimer = false; bool hasPairedBarcodes = false; int indexPrimer = 0; int indexBarcode = 0; + int indexPairedPrimer = 0; + int indexPairedBarcode = 0; + set uniquePrimers; + set uniqueBarcodes; while(!inOligos.eof()){ - inOligos >> type; m->gobble(inOligos); - + inOligos >> type; + + if (m->debug) { m->mothurOut("[DEBUG]: reading type - " + type + ".\n"); } + if(type[0] == '#'){ - while (!inOligos.eof()) { char c = inOligos.get(); if (c == 10 || c == 13){ break; } } // get rest of line if there's any crap there + while (!inOligos.eof()) { char c = inOligos.get(); if (c == 10 || c == 13){ break; } } // get rest of line if there's any crap there + m->gobble(inOligos); } else{ + m->gobble(inOligos); //make type case insensitive for(int i=0;i> oligo; + + if (m->debug) { m->mothurOut("[DEBUG]: reading - " + oligo + ".\n"); } for(int i=0;i >& fastaFileNames, vector< // get rest of line in case there is a primer name while (!inOligos.eof()) { char c = inOligos.get(); - if (c == 10 || c == 13){ break; } + if (c == 10 || c == 13 || c == -1){ break; } else if (c == 32 || c == 9){;} //space or tab else { group += c; } } @@ -850,32 +1540,116 @@ void TrimSeqsCommand::getOligos(vector >& fastaFileNames, vector< map::iterator itPrime = primers.find(oligo); if (itPrime != primers.end()) { m->mothurOut("primer " + oligo + " is in your oligos file already."); m->mothurOutEndLine(); } + if (m->debug) { if (group != "") { m->mothurOut("[DEBUG]: reading group " + group + ".\n"); }else{ m->mothurOut("[DEBUG]: no group for primer " + oligo + ".\n"); } } + primers[oligo]=indexPrimer; indexPrimer++; primerNameVector.push_back(group); } + else if (type == "PRIMER"){ + m->gobble(inOligos); + + inOligos >> roligo; + + for(int i=0;idebug) { m->mothurOut("[DEBUG]: primer pair " + newPrimer.forward + " " + newPrimer.reverse + ", and group = " + group + ".\n"); } + + //check for repeat barcodes + string tempPair = oligo+roligo; + if (uniquePrimers.count(tempPair) != 0) { m->mothurOut("primer pair " + newPrimer.forward + " " + newPrimer.reverse + " is in your oligos file already."); m->mothurOutEndLine(); } + else { uniquePrimers.insert(tempPair); } + + if (m->debug) { if (group != "") { m->mothurOut("[DEBUG]: reading group " + group + ".\n"); }else{ m->mothurOut("[DEBUG]: no group for primer pair " + newPrimer.forward + " " + newPrimer.reverse + ".\n"); } } + + pairedPrimers[indexPairedPrimer]=newPrimer; indexPairedPrimer++; + primerNameVector.push_back(group); + hasPrimer = true; + } else if(type == "REVERSE"){ - Sequence oligoRC("reverse", oligo); - oligoRC.reverseComplement(); - revPrimer.push_back(oligoRC.getUnaligned()); + //Sequence oligoRC("reverse", oligo); + //oligoRC.reverseComplement(); + string oligoRC = reverseOligo(oligo); + revPrimer.push_back(oligoRC); } else if(type == "BARCODE"){ inOligos >> group; + + //barcode lines can look like BARCODE atgcatgc groupName - for 454 seqs + //or BARCODE atgcatgc atgcatgc groupName - for illumina data that has forward and reverse info + + string temp = ""; + while (!inOligos.eof()) { + char c = inOligos.get(); + if (c == 10 || c == 13 || c == -1){ break; } + else if (c == 32 || c == 9){;} //space or tab + else { temp += c; } + } - //check for repeat barcodes - map::iterator itBar = barcodes.find(oligo); - if (itBar != barcodes.end()) { m->mothurOut("barcode " + oligo + " is in your oligos file already."); m->mothurOutEndLine(); } - - barcodes[oligo]=indexBarcode; indexBarcode++; - barcodeNameVector.push_back(group); + //then this is illumina data with 4 columns + if (temp != "") { + hasPairedBarcodes = true; + string reverseBarcode = group; //reverseOligo(group); //reverse barcode + group = temp; + + for(int i=0;idebug) { m->mothurOut("[DEBUG]: barcode pair " + newPair.forward + " " + newPair.reverse + ", and group = " + group + ".\n"); } + + //check for repeat barcodes + string tempPair = oligo+reverseBarcode; + if (uniqueBarcodes.count(tempPair) != 0) { m->mothurOut("barcode pair " + newPair.forward + " " + newPair.reverse + " is in your oligos file already, disregarding."); m->mothurOutEndLine(); } + else { uniqueBarcodes.insert(tempPair); } + + pairedBarcodes[indexPairedBarcode]=newPair; indexPairedBarcode++; + barcodeNameVector.push_back(group); + }else { + //check for repeat barcodes + map::iterator itBar = barcodes.find(oligo); + if (itBar != barcodes.end()) { m->mothurOut("barcode " + oligo + " is in your oligos file already."); m->mothurOutEndLine(); } + + barcodes[oligo]=indexBarcode; indexBarcode++; + barcodeNameVector.push_back(group); + } + }else if(type == "LINKER"){ + linker.push_back(oligo); + }else if(type == "SPACER"){ + spacer.push_back(oligo); } - else{ m->mothurOut(type + " is not recognized as a valid type. Choices are forward, reverse, and barcode. Ignoring " + oligo + "."); m->mothurOutEndLine(); } + else{ m->mothurOut("[WARNING]: " + type + " is not recognized as a valid type. Choices are forward, reverse, and barcode. Ignoring " + oligo + "."); m->mothurOutEndLine(); } } m->gobble(inOligos); } inOligos.close(); + if (hasPairedBarcodes || hasPrimer) { + pairedOligos = true; + if ((primers.size() != 0) || (barcodes.size() != 0) || (linker.size() != 0) || (spacer.size() != 0) || (revPrimer.size() != 0)) { m->control_pressed = true; m->mothurOut("[ERROR]: cannot mix paired primers and barcodes with non paired or linkers and spacers, quitting."); m->mothurOutEndLine(); return 0; } + }else if (reorient) { m->mothurOut("[Warning]: cannot use checkorient without paired barcodes or primers, ignoring.\n"); m->mothurOutEndLine(); reorient = false; } + if(barcodeNameVector.size() == 0 && primerNameVector[0] == ""){ allFiles = 0; } - + //add in potential combos if(barcodeNameVector.size() == 0){ barcodes[""] = 0; @@ -891,330 +1665,183 @@ void TrimSeqsCommand::getOligos(vector >& fastaFileNames, vector< for(int i=0;i::iterator itBar = barcodes.begin();itBar != barcodes.end();itBar++){ - for(map::iterator itPrimer = primers.begin();itPrimer != primers.end(); itPrimer++){ - - string primerName = primerNameVector[itPrimer->second]; - string barcodeName = barcodeNameVector[itBar->second]; - - string comboGroupName = ""; - string fastaFileName = ""; - string qualFileName = ""; - - if(primerName == ""){ - comboGroupName = barcodeNameVector[itBar->second]; - } - else{ - if(barcodeName == ""){ - comboGroupName = primerNameVector[itPrimer->second]; - } - else{ - comboGroupName = barcodeNameVector[itBar->second] + "." + primerNameVector[itPrimer->second]; - } - } - - ofstream temp; - fastaFileName = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + comboGroupName + ".fasta"; - outputNames.push_back(fastaFileName); - outputTypes["fasta"].push_back(fastaFileName); - fastaFileNames[itBar->second][itPrimer->second] = fastaFileName; - m->openOutputFile(fastaFileName, temp); temp.close(); - - if(qFileName != ""){ - qualFileName = outputDir + m->getRootName(m->getSimpleName(qFileName)) + comboGroupName + ".qual"; - outputNames.push_back(qualFileName); - outputTypes["qfile"].push_back(qualFileName); - qualFileNames[itBar->second][itPrimer->second] = qualFileName; - m->openOutputFile(qualFileName, temp); temp.close(); - } - } - } + set uniqueNames; //used to cleanup outputFileNames + if (pairedOligos) { + for(map::iterator itBar = pairedBarcodes.begin();itBar != pairedBarcodes.end();itBar++){ + for(map::iterator itPrimer = pairedPrimers.begin();itPrimer != pairedPrimers.end(); itPrimer++){ + + string primerName = primerNameVector[itPrimer->first]; + string barcodeName = barcodeNameVector[itBar->first]; + + if ((primerName == "ignore") || (barcodeName == "ignore")) { } //do nothing + else { + string comboGroupName = ""; + string fastaFileName = ""; + string qualFileName = ""; + string nameFileName = ""; + string countFileName = ""; + + if(primerName == ""){ + comboGroupName = barcodeNameVector[itBar->first]; + } + else{ + if(barcodeName == ""){ + comboGroupName = primerNameVector[itPrimer->first]; + } + else{ + comboGroupName = barcodeNameVector[itBar->first] + "." + primerNameVector[itPrimer->first]; + } + } + + + ofstream temp; + map variables; + variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(fastaFile)); + variables["[tag]"] = comboGroupName; + fastaFileName = getOutputFileName("fasta", variables); + if (uniqueNames.count(fastaFileName) == 0) { + outputNames.push_back(fastaFileName); + outputTypes["fasta"].push_back(fastaFileName); + uniqueNames.insert(fastaFileName); + } + + fastaFileNames[itBar->first][itPrimer->first] = fastaFileName; + m->openOutputFile(fastaFileName, temp); temp.close(); + + if(qFileName != ""){ + variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(qFileName)); + qualFileName = getOutputFileName("qfile", variables); + if (uniqueNames.count(qualFileName) == 0) { + outputNames.push_back(qualFileName); + outputTypes["qfile"].push_back(qualFileName); + } + + qualFileNames[itBar->first][itPrimer->first] = qualFileName; + m->openOutputFile(qualFileName, temp); temp.close(); + } + + if(nameFile != ""){ + variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(nameFile)); + nameFileName = getOutputFileName("name", variables); + if (uniqueNames.count(nameFileName) == 0) { + outputNames.push_back(nameFileName); + outputTypes["name"].push_back(nameFileName); + } + + nameFileNames[itBar->first][itPrimer->first] = nameFileName; + m->openOutputFile(nameFileName, temp); temp.close(); + } + } + } + } + }else { + for(map::iterator itBar = barcodes.begin();itBar != barcodes.end();itBar++){ + for(map::iterator itPrimer = primers.begin();itPrimer != primers.end(); itPrimer++){ + + string primerName = primerNameVector[itPrimer->second]; + string barcodeName = barcodeNameVector[itBar->second]; + + if ((primerName == "ignore") || (barcodeName == "ignore")) { } //do nothing + else { + string comboGroupName = ""; + string fastaFileName = ""; + string qualFileName = ""; + string nameFileName = ""; + string countFileName = ""; + + if(primerName == ""){ + comboGroupName = barcodeNameVector[itBar->second]; + } + else{ + if(barcodeName == ""){ + comboGroupName = primerNameVector[itPrimer->second]; + } + else{ + comboGroupName = barcodeNameVector[itBar->second] + "." + primerNameVector[itPrimer->second]; + } + } + + + ofstream temp; + map variables; + variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(fastaFile)); + variables["[tag]"] = comboGroupName; + fastaFileName = getOutputFileName("fasta", variables); + if (uniqueNames.count(fastaFileName) == 0) { + outputNames.push_back(fastaFileName); + outputTypes["fasta"].push_back(fastaFileName); + uniqueNames.insert(fastaFileName); + } + + fastaFileNames[itBar->second][itPrimer->second] = fastaFileName; + m->openOutputFile(fastaFileName, temp); temp.close(); + + if(qFileName != ""){ + variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(qFileName)); + qualFileName = getOutputFileName("qfile", variables); + if (uniqueNames.count(qualFileName) == 0) { + outputNames.push_back(qualFileName); + outputTypes["qfile"].push_back(qualFileName); + } + + qualFileNames[itBar->second][itPrimer->second] = qualFileName; + m->openOutputFile(qualFileName, temp); temp.close(); + } + + if(nameFile != ""){ + variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(nameFile)); + nameFileName = getOutputFileName("name", variables); + if (uniqueNames.count(nameFileName) == 0) { + outputNames.push_back(nameFileName); + outputTypes["name"].push_back(nameFileName); + } + + nameFileNames[itBar->second][itPrimer->second] = nameFileName; + m->openOutputFile(nameFileName, temp); temp.close(); + } + } + } + } + } } numFPrimers = primers.size(); + if (pairedOligos) { numFPrimers = pairedPrimers.size(); } numRPrimers = revPrimer.size(); - - } - catch(exception& e) { - m->errorOut(e, "TrimSeqsCommand", "getOligos"); - exit(1); - } -} - -//*************************************************************************************************************** - -int TrimSeqsCommand::stripBarcode(Sequence& seq, QualityScores& qual, int& group){ - try { + numLinkers = linker.size(); + numSpacers = spacer.size(); - string rawSequence = seq.getUnaligned(); - int success = bdiffs + 1; //guilty until proven innocent - - //can you find the barcode - for(map::iterator it=barcodes.begin();it!=barcodes.end();it++){ - string oligo = it->first; - if(rawSequence.length() < oligo.length()){ //let's just assume that the barcodes are the same length - success = bdiffs + 10; //if the sequence is shorter than the barcode then bail out - break; - } - - if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){ - group = it->second; - seq.setUnaligned(rawSequence.substr(oligo.length())); - - if(qual.getName() != ""){ - qual.trimQScores(oligo.length(), -1); - } - - success = 0; + bool allBlank = true; + for (int i = 0; i < barcodeNameVector.size(); i++) { + if (barcodeNameVector[i] != "") { + allBlank = false; break; } } - - //if you found the barcode or if you don't want to allow for diffs - if ((bdiffs == 0) || (success == 0)) { return success; } - - else { //try aligning and see if you can find it - - int maxLength = 0; - - Alignment* alignment; - if (barcodes.size() > 0) { - map::iterator it=barcodes.begin(); - - for(it;it!=barcodes.end();it++){ - if(it->first.length() > maxLength){ - maxLength = it->first.length(); - } - } - alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+bdiffs+1)); - - }else{ alignment = NULL; } - - //can you find the barcode - int minDiff = 1e6; - int minCount = 1; - int minGroup = -1; - int minPos = 0; - - for(map::iterator it=barcodes.begin();it!=barcodes.end();it++){ - string oligo = it->first; -// int length = oligo.length(); - - if(rawSequence.length() < maxLength){ //let's just assume that the barcodes are the same length - success = bdiffs + 10; - break; - } - - //use needleman to align first barcode.length()+numdiffs of sequence to each barcode - alignment->align(oligo, rawSequence.substr(0,oligo.length()+bdiffs)); - oligo = alignment->getSeqAAln(); - string temp = alignment->getSeqBAln(); - - int alnLength = oligo.length(); - - for(int i=oligo.length()-1;i>=0;i--){ - if(oligo[i] != '-'){ alnLength = i+1; break; } - } - oligo = oligo.substr(0,alnLength); - temp = temp.substr(0,alnLength); - - int numDiff = countDiffs(oligo, temp); - - if(numDiff < minDiff){ - minDiff = numDiff; - minCount = 1; - minGroup = it->second; - minPos = 0; - for(int i=0;i bdiffs) { success = minDiff; } //no good matches - else if(minCount > 1) { success = bdiffs + 100; } //can't tell the difference between multiple barcodes - else{ //use the best match - group = minGroup; - seq.setUnaligned(rawSequence.substr(minPos)); - - if(qual.getName() != ""){ - qual.trimQScores(minPos, -1); - } - success = minDiff; - } - - if (alignment != NULL) { delete alignment; } - - } - - return success; - - } - catch(exception& e) { - m->errorOut(e, "TrimSeqsCommand", "stripBarcode"); - exit(1); - } - -} - -//*************************************************************************************************************** - -int TrimSeqsCommand::stripForward(Sequence& seq, QualityScores& qual, int& group){ - try { - string rawSequence = seq.getUnaligned(); - int success = pdiffs + 1; //guilty until proven innocent - - //can you find the primer - for(map::iterator it=primers.begin();it!=primers.end();it++){ - string oligo = it->first; - if(rawSequence.length() < oligo.length()){ //let's just assume that the primers are the same length - success = pdiffs + 10; //if the sequence is shorter than the barcode then bail out - break; - } - - if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){ - group = it->second; - seq.setUnaligned(rawSequence.substr(oligo.length())); - if(qual.getName() != ""){ - qual.trimQScores(oligo.length(), -1); - } - success = 0; + for (int i = 0; i < primerNameVector.size(); i++) { + if (primerNameVector[i] != "") { + allBlank = false; break; } } - //if you found the barcode or if you don't want to allow for diffs - if ((pdiffs == 0) || (success == 0)) { return success; } - - else { //try aligning and see if you can find it - - int maxLength = 0; - - Alignment* alignment; - if (primers.size() > 0) { - map::iterator it=primers.begin(); - - for(it;it!=primers.end();it++){ - if(it->first.length() > maxLength){ - maxLength = it->first.length(); - } - } - alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+pdiffs+1)); - - }else{ alignment = NULL; } - - //can you find the barcode - int minDiff = 1e6; - int minCount = 1; - int minGroup = -1; - int minPos = 0; - - for(map::iterator it=primers.begin();it!=primers.end();it++){ - string oligo = it->first; -// int length = oligo.length(); - - if(rawSequence.length() < maxLength){ - success = pdiffs + 100; - break; - } - - //use needleman to align first barcode.length()+numdiffs of sequence to each barcode - alignment->align(oligo, rawSequence.substr(0,oligo.length()+pdiffs)); - oligo = alignment->getSeqAAln(); - string temp = alignment->getSeqBAln(); - - int alnLength = oligo.length(); - - for(int i=oligo.length()-1;i>=0;i--){ - if(oligo[i] != '-'){ alnLength = i+1; break; } - } - oligo = oligo.substr(0,alnLength); - temp = temp.substr(0,alnLength); - - int numDiff = countDiffs(oligo, temp); - - if(numDiff < minDiff){ - minDiff = numDiff; - minCount = 1; - minGroup = it->second; - minPos = 0; - for(int i=0;i pdiffs) { success = minDiff; } //no good matches - else if(minCount > 1) { success = pdiffs + 10; } //can't tell the difference between multiple primers - else{ //use the best match - group = minGroup; - seq.setUnaligned(rawSequence.substr(minPos)); - if(qual.getName() != ""){ - qual.trimQScores(minPos, -1); - } - success = minDiff; - } - - if (alignment != NULL) { delete alignment; } - + if (allBlank) { + m->mothurOut("[WARNING]: your oligos file does not contain any group names. mothur will not create a groupfile."); m->mothurOutEndLine(); + allFiles = false; + return false; } - return success; - - } - catch(exception& e) { - m->errorOut(e, "TrimSeqsCommand", "stripForward"); - exit(1); - } -} - -//*************************************************************************************************************** - -bool TrimSeqsCommand::stripReverse(Sequence& seq, QualityScores& qual){ - try { - string rawSequence = seq.getUnaligned(); - bool success = 0; //guilty until proven innocent - - for(int i=0;ierrorOut(e, "TrimSeqsCommand", "stripReverse"); + m->errorOut(e, "TrimSeqsCommand", "getOligos"); exit(1); } } - //*************************************************************************************************************** bool TrimSeqsCommand::keepFirstTrim(Sequence& sequence, QualityScores& qscores){ @@ -1223,7 +1850,13 @@ bool TrimSeqsCommand::keepFirstTrim(Sequence& sequence, QualityScores& qscores){ if(qscores.getName() != ""){ qscores.trimQScores(-1, keepFirst); } + +// sequence.printSequence(cout);cout << endl; + sequence.trim(keepFirst); + +// sequence.printSequence(cout);cout << endl << endl;; + return success; } catch(exception& e) { @@ -1301,6 +1934,46 @@ bool TrimSeqsCommand::cullHomoP(Sequence& seq){ } } +//********************************************************************/ +string TrimSeqsCommand::reverseOligo(string oligo){ + try { + string reverse = ""; + + for(int i=oligo.length()-1;i>=0;i--){ + + if(oligo[i] == 'A') { reverse += 'T'; } + else if(oligo[i] == 'T'){ reverse += 'A'; } + else if(oligo[i] == 'U'){ reverse += 'A'; } + + else if(oligo[i] == 'G'){ reverse += 'C'; } + else if(oligo[i] == 'C'){ reverse += 'G'; } + + else if(oligo[i] == 'R'){ reverse += 'Y'; } + else if(oligo[i] == 'Y'){ reverse += 'R'; } + + else if(oligo[i] == 'M'){ reverse += 'K'; } + else if(oligo[i] == 'K'){ reverse += 'M'; } + + else if(oligo[i] == 'W'){ reverse += 'W'; } + else if(oligo[i] == 'S'){ reverse += 'S'; } + + else if(oligo[i] == 'B'){ reverse += 'V'; } + else if(oligo[i] == 'V'){ reverse += 'B'; } + + else if(oligo[i] == 'D'){ reverse += 'H'; } + else if(oligo[i] == 'H'){ reverse += 'D'; } + + else { reverse += 'N'; } + } + + + return reverse; + } + catch(exception& e) { + m->errorOut(e, "TrimSeqsCommand", "reverseOligo"); + exit(1); + } +} //*************************************************************************************************************** @@ -1320,80 +1993,4 @@ bool TrimSeqsCommand::cullAmbigs(Sequence& seq){ } } - -//*************************************************************************************************************** - -bool TrimSeqsCommand::compareDNASeq(string oligo, string seq){ - try { - bool success = 1; - int length = oligo.length(); - - for(int i=0;ierrorOut(e, "TrimSeqsCommand", "compareDNASeq"); - exit(1); - } - -} - -//*************************************************************************************************************** - -int TrimSeqsCommand::countDiffs(string oligo, string seq){ - try { - - int length = oligo.length(); - int countDiffs = 0; - - for(int i=0;ierrorOut(e, "TrimSeqsCommand", "countDiffs"); - exit(1); - } - -} - //***************************************************************************************************************