X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=screenseqscommand.cpp;h=9f7aa40b3cc6786388a4e9eb8e47326661460e2d;hb=1a1ed6dda1d655ff006459f15c712f057c93ddaf;hp=937959c9c1458fffd66d4af72e8525fcf0ae389d;hpb=4f9a6e14a608172f8a97f0297a3b8e6ea267c518;p=mothur.git diff --git a/screenseqscommand.cpp b/screenseqscommand.cpp index 937959c..9f7aa40 100644 --- a/screenseqscommand.cpp +++ b/screenseqscommand.cpp @@ -11,15 +11,60 @@ #include "sequence.hpp" //********************************************************************************************************************** -vector ScreenSeqsCommand::getValidParameters(){ +vector ScreenSeqsCommand::setParameters(){ try { - string Array[] = {"fasta", "start", "end", "maxambig", "maxhomop","optimize","criteria", "minlength", "maxlength", - "name", "group", "alignreport","processors","outputdir","inputdir"}; - vector myArray (Array, Array+(sizeof(Array)/sizeof(string))); + CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta); + CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname); + CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pgroup); + CommandParameter pqfile("qfile", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pqfile); + CommandParameter palignreport("alignreport", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(palignreport); + CommandParameter pstart("start", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pstart); + CommandParameter pend("end", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pend); + CommandParameter pmaxambig("maxambig", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pmaxambig); + CommandParameter pmaxhomop("maxhomop", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pmaxhomop); + CommandParameter pminlength("minlength", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pminlength); + CommandParameter pmaxlength("maxlength", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pmaxlength); + CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors); + CommandParameter pcriteria("criteria", "Number", "", "90", "", "", "",false,false); parameters.push_back(pcriteria); + CommandParameter poptimize("optimize", "Multiple", "none-start-end-maxambig-maxhomop-minlength-maxlength", "none", "", "", "",true,false); parameters.push_back(poptimize); + CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir); + CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir); + + vector myArray; + for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); } return myArray; } catch(exception& e) { - m->errorOut(e, "ScreenSeqsCommand", "getValidParameters"); + m->errorOut(e, "ScreenSeqsCommand", "setParameters"); + exit(1); + } +} +//********************************************************************************************************************** +string ScreenSeqsCommand::getHelpString(){ + try { + string helpString = ""; + helpString += "The screen.seqs command reads a fastafile and creates .....\n"; + helpString += "The screen.seqs command parameters are fasta, start, end, maxambig, maxhomop, minlength, maxlength, name, group, qfile, optimize, criteria and processors.\n"; + helpString += "The fasta parameter is required.\n"; + helpString += "The start parameter .... The default is -1.\n"; + helpString += "The end parameter .... The default is -1.\n"; + helpString += "The maxambig parameter allows you to set the maximum number of ambigious bases allowed. The default is -1.\n"; + helpString += "The maxhomop parameter allows you to set a maximum homopolymer length. \n"; + helpString += "The minlength parameter allows you to set and minimum sequence length. \n"; + helpString += "The maxlength parameter allows you to set and maximum sequence length. \n"; + helpString += "The processors parameter allows you to specify the number of processors to use while running the command. The default is 1.\n"; + helpString += "The optimize and criteria parameters allow you set the start, end, maxabig, maxhomop, minlength and maxlength parameters relative to your set of sequences .\n"; + helpString += "For example optimize=start-end, criteria=90, would set the start and end values to the position 90% of your sequences started and ended.\n"; + helpString += "The name parameter allows you to provide a namesfile, and the group parameter allows you to provide a groupfile.\n"; + helpString += "The screen.seqs command should be in the following format: \n"; + helpString += "screen.seqs(fasta=yourFastaFile, name=youNameFile, group=yourGroupFIle, start=yourStart, end=yourEnd, maxambig=yourMaxambig, \n"; + helpString += "maxhomop=yourMaxhomop, minlength=youMinlength, maxlength=yourMaxlength) \n"; + helpString += "Example screen.seqs(fasta=abrecovery.fasta, name=abrecovery.names, group=abrecovery.groups, start=..., end=..., maxambig=..., maxhomop=..., minlength=..., maxlength=...).\n"; + helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n"; + return helpString; + } + catch(exception& e) { + m->errorOut(e, "ScreenSeqsCommand", "getHelpString"); exit(1); } } @@ -27,41 +72,20 @@ vector ScreenSeqsCommand::getValidParameters(){ ScreenSeqsCommand::ScreenSeqsCommand(){ try { abort = true; calledHelp = true; + setParameters(); vector tempOutNames; outputTypes["fasta"] = tempOutNames; outputTypes["name"] = tempOutNames; outputTypes["group"] = tempOutNames; outputTypes["alignreport"] = tempOutNames; outputTypes["accnos"] = tempOutNames; + outputTypes["qfile"] = tempOutNames; } catch(exception& e) { m->errorOut(e, "ScreenSeqsCommand", "ScreenSeqsCommand"); exit(1); } } -//********************************************************************************************************************** -vector ScreenSeqsCommand::getRequiredParameters(){ - try { - string Array[] = {"fasta"}; - vector myArray (Array, Array+(sizeof(Array)/sizeof(string))); - return myArray; - } - catch(exception& e) { - m->errorOut(e, "ScreenSeqsCommand", "getRequiredParameters"); - exit(1); - } -} -//********************************************************************************************************************** -vector ScreenSeqsCommand::getRequiredFiles(){ - try { - vector myArray; - return myArray; - } - catch(exception& e) { - m->errorOut(e, "ScreenSeqsCommand", "getRequiredFiles"); - exit(1); - } -} //*************************************************************************************************************** ScreenSeqsCommand::ScreenSeqsCommand(string option) { @@ -70,12 +94,10 @@ ScreenSeqsCommand::ScreenSeqsCommand(string option) { //allow user to run help if(option == "help") { help(); abort = true; calledHelp = true; } + else if(option == "citation") { citation(); abort = true; calledHelp = true;} else { - //valid paramters for this command - string AlignArray[] = {"fasta", "start", "end", "maxambig", "maxhomop","optimize","criteria", "minlength", "maxlength", - "name", "group", "alignreport","processors","outputdir","inputdir"}; - vector myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string))); + vector myArray = setParameters(); OptionParser parser(option); map parameters = parser.getParameters(); @@ -95,6 +117,7 @@ ScreenSeqsCommand::ScreenSeqsCommand(string option) { outputTypes["group"] = tempOutNames; outputTypes["alignreport"] = tempOutNames; outputTypes["accnos"] = tempOutNames; + outputTypes["qfile"] = tempOutNames; //if the user changes the input directory command factory will send this info to us in the output parameter string inputDir = validParameter.validFile(parameters, "inputdir", false); @@ -132,21 +155,42 @@ ScreenSeqsCommand::ScreenSeqsCommand(string option) { //if the user has not given a path then, add inputdir. else leave path alone. if (path == "") { parameters["alignreport"] = inputDir + it->second; } } + + it = parameters.find("qfile"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["qfile"] = inputDir + it->second; } + } + } //check for required parameters fastafile = validParameter.validFile(parameters, "fasta", true); - if (fastafile == "not found") { m->mothurOut("fasta is a required parameter for the screen.seqs command."); m->mothurOutEndLine(); abort = true; } - else if (fastafile == "not open") { abort = true; } + if (fastafile == "not found") { + fastafile = m->getFastaFile(); + if (fastafile != "") { m->mothurOut("Using " + fastafile + " as input file for the fasta parameter."); m->mothurOutEndLine(); } + else { m->mothurOut("You have no current fastafile and the fasta parameter is required."); m->mothurOutEndLine(); abort = true; } + } + else if (fastafile == "not open") { abort = true; } + else { m->setFastaFile(fastafile); } groupfile = validParameter.validFile(parameters, "group", true); if (groupfile == "not open") { abort = true; } else if (groupfile == "not found") { groupfile = ""; } + else { m->setGroupFile(groupfile); } + + qualfile = validParameter.validFile(parameters, "qfile", true); + if (qualfile == "not open") { abort = true; } + else if (qualfile == "not found") { qualfile = ""; } + else { m->setQualFile(qualfile); } namefile = validParameter.validFile(parameters, "name", true); if (namefile == "not open") { namefile = ""; abort = true; } else if (namefile == "not found") { namefile = ""; } - + else { m->setNameFile(namefile); } + alignreport = validParameter.validFile(parameters, "alignreport", true); if (alignreport == "not open") { abort = true; } else if (alignreport == "not found") { alignreport = ""; } @@ -178,17 +222,17 @@ ScreenSeqsCommand::ScreenSeqsCommand(string option) { temp = validParameter.validFile(parameters, "maxlength", false); if (temp == "not found") { temp = "-1"; } convert(temp, maxLength); - temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = "1"; } - convert(temp, processors); + temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); } + m->setProcessors(temp); + convert(temp, processors); temp = validParameter.validFile(parameters, "optimize", false); //optimizing trumps the optimized values original value - if (temp == "not found"){ temp = ""; } - else { m->splitAtDash(temp, optimize); } + if (temp == "not found"){ temp = "none"; } + m->splitAtDash(temp, optimize); //check for invalid optimize options set validOptimizers; - validOptimizers.insert("start"); validOptimizers.insert("end"); validOptimizers.insert("maxambig"); validOptimizers.insert("maxhomop"); validOptimizers.insert("minlength"); validOptimizers.insert("maxlength"); - + validOptimizers.insert("none"); validOptimizers.insert("start"); validOptimizers.insert("end"); validOptimizers.insert("maxambig"); validOptimizers.insert("maxhomop"); validOptimizers.insert("minlength"); validOptimizers.insert("maxlength"); for (int i = 0; i < optimize.size(); i++) { if (validOptimizers.count(optimize[i]) == 0) { m->mothurOut(optimize[i] + " is not a valid optimizer. Valid options are start, end, maxambig, maxhomop, minlength and maxlength."); m->mothurOutEndLine(); @@ -197,6 +241,8 @@ ScreenSeqsCommand::ScreenSeqsCommand(string option) { } } + if (optimize.size() == 1) { if (optimize[0] == "none") { optimize.clear(); } } + temp = validParameter.validFile(parameters, "criteria", false); if (temp == "not found"){ temp = "90"; } convert(temp, criteria); } @@ -207,40 +253,6 @@ ScreenSeqsCommand::ScreenSeqsCommand(string option) { exit(1); } } -//********************************************************************************************************************** - -void ScreenSeqsCommand::help(){ - try { - m->mothurOut("The screen.seqs command reads a fastafile and creates .....\n"); - m->mothurOut("The screen.seqs command parameters are fasta, start, end, maxambig, maxhomop, minlength, maxlength, name, group, optimize, criteria and processors.\n"); - m->mothurOut("The fasta parameter is required.\n"); - m->mothurOut("The start parameter .... The default is -1.\n"); - m->mothurOut("The end parameter .... The default is -1.\n"); - m->mothurOut("The maxambig parameter .... The default is -1.\n"); - m->mothurOut("The maxhomop parameter .... The default is -1.\n"); - m->mothurOut("The minlength parameter .... The default is -1.\n"); - m->mothurOut("The maxlength parameter .... The default is -1.\n"); - m->mothurOut("The processors parameter allows you to specify the number of processors to use while running the command. The default is 1.\n"); - m->mothurOut("The optimize and criteria parameters allow you set the start, end, maxabig, maxhomop, minlength and maxlength parameters relative to your set of sequences .\n"); - m->mothurOut("For example optimize=start-end, criteria=90, would set the start and end values to the position 90% of your sequences started and ended.\n"); - m->mothurOut("The name parameter allows you to provide a namesfile, and the group parameter allows you to provide a groupfile.\n"); - m->mothurOut("The screen.seqs command should be in the following format: \n"); - m->mothurOut("screen.seqs(fasta=yourFastaFile, name=youNameFile, group=yourGroupFIle, start=yourStart, end=yourEnd, maxambig=yourMaxambig, \n"); - m->mothurOut("maxhomop=yourMaxhomop, minlength=youMinlength, maxlength=yourMaxlength) \n"); - m->mothurOut("Example screen.seqs(fasta=abrecovery.fasta, name=abrecovery.names, group=abrecovery.groups, start=..., end=..., maxambig=..., maxhomop=..., minlength=..., maxlength=...).\n"); - m->mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n\n"); - - } - catch(exception& e) { - m->errorOut(e, "ScreenSeqsCommand", "help"); - exit(1); - } -} - -//*************************************************************************************************************** - -ScreenSeqsCommand::~ScreenSeqsCommand(){ /* do nothing */ } - //*************************************************************************************************************** int ScreenSeqsCommand::execute(){ @@ -470,6 +482,7 @@ int ScreenSeqsCommand::execute(){ if (m->control_pressed) { remove(goodSeqFile.c_str()); return 0; } if(alignreport != "") { screenAlignReport(badSeqNames); } + if(qualfile != "") { screenQual(badSeqNames); } if (m->control_pressed) { remove(goodSeqFile.c_str()); return 0; } @@ -484,6 +497,28 @@ int ScreenSeqsCommand::execute(){ for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); } m->mothurOutEndLine(); m->mothurOutEndLine(); + + //set fasta file as new current fastafile + string current = ""; + itTypes = outputTypes.find("fasta"); + if (itTypes != outputTypes.end()) { + if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFastaFile(current); } + } + + itTypes = outputTypes.find("name"); + if (itTypes != outputTypes.end()) { + if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setNameFile(current); } + } + + itTypes = outputTypes.find("group"); + if (itTypes != outputTypes.end()) { + if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setGroupFile(current); } + } + + itTypes = outputTypes.find("qfile"); + if (itTypes != outputTypes.end()) { + if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setQualFile(current); } + } m->mothurOut("It took " + toString(time(NULL) - start) + " secs to screen " + toString(numFastaSeqs) + " sequences."); m->mothurOutEndLine(); @@ -634,10 +669,10 @@ int ScreenSeqsCommand::getSummary(vector& positions){ for (int i = 0; i < optimize.size(); i++) { if (optimize[i] == "start") { startPos = startPosition[criteriaPercentile]; m->mothurOut("Optimizing start to " + toString(startPos) + "."); m->mothurOutEndLine(); } - else if (optimize[i] == "end") { int endcriteriaPercentile = int(numSeqs * ((100 - criteria) / (float) 100)); endPos = endPosition[endcriteriaPercentile]; m->mothurOut("Optimizing end to " + toString(endPos) + "."); m->mothurOutEndLine();} + else if (optimize[i] == "end") { int endcriteriaPercentile = int(endPosition.size() * ((100 - criteria) / (float) 100)); endPos = endPosition[endcriteriaPercentile]; m->mothurOut("Optimizing end to " + toString(endPos) + "."); m->mothurOutEndLine();} else if (optimize[i] == "maxambig") { maxAmbig = ambigBases[criteriaPercentile]; m->mothurOut("Optimizing maxambig to " + toString(maxAmbig) + "."); m->mothurOutEndLine(); } else if (optimize[i] == "maxhomop") { maxHomoP = longHomoPolymer[criteriaPercentile]; m->mothurOut("Optimizing maxhomop to " + toString(maxHomoP) + "."); m->mothurOutEndLine(); } - else if (optimize[i] == "minlength") { int mincriteriaPercentile = int(numSeqs * ((100 - criteria) / (float) 100)); minLength = seqLength[mincriteriaPercentile]; m->mothurOut("Optimizing minlength to " + toString(minLength) + "."); m->mothurOutEndLine(); } + else if (optimize[i] == "minlength") { int mincriteriaPercentile = int(seqLength.size() * ((100 - criteria) / (float) 100)); minLength = seqLength[mincriteriaPercentile]; m->mothurOut("Optimizing minlength to " + toString(minLength) + "."); m->mothurOutEndLine(); } else if (optimize[i] == "maxlength") { maxLength = seqLength[criteriaPercentile]; m->mothurOut("Optimizing maxlength to " + toString(maxLength) + "."); m->mothurOutEndLine(); } } @@ -898,6 +933,79 @@ int ScreenSeqsCommand::screenAlignReport(set badSeqNames){ exit(1); } +} +//*************************************************************************************************************** + +int ScreenSeqsCommand::screenQual(set badSeqNames){ + try { + ifstream in; + m->openInputFile(qualfile, in); + set::iterator it; + + string goodQualFile = outputDir + m->getRootName(m->getSimpleName(qualfile)) + "good" + m->getExtension(qualfile); + outputNames.push_back(goodQualFile); outputTypes["qfile"].push_back(goodQualFile); + ofstream goodQual; m->openOutputFile(goodQualFile, goodQual); + + while(!in.eof()){ + + if (m->control_pressed) { goodQual.close(); in.close(); remove(goodQualFile.c_str()); return 0; } + + string saveName = ""; + string name = ""; + string scores = ""; + + in >> name; + + if (name.length() != 0) { + saveName = name.substr(1); + while (!in.eof()) { + char c = in.get(); + if (c == 10 || c == 13){ break; } + else { name += c; } + } + m->gobble(in); + } + + while(in){ + char letter= in.get(); + if(letter == '>'){ in.putback(letter); break; } + else{ scores += letter; } + } + + m->gobble(in); + + it = badSeqNames.find(saveName); + + if(it != badSeqNames.end()){ + badSeqNames.erase(it); + }else{ + goodQual << name << endl << scores; + } + + m->gobble(in); + } + + in.close(); + goodQual.close(); + + //we were unable to remove some of the bad sequences + if (badSeqNames.size() != 0) { + for (it = badSeqNames.begin(); it != badSeqNames.end(); it++) { + m->mothurOut("Your qual file does not include the sequence " + *it + " please correct."); + m->mothurOutEndLine(); + } + } + + if (m->control_pressed) { remove(goodQualFile.c_str()); return 0; } + + return 0; + + } + catch(exception& e) { + m->errorOut(e, "ScreenSeqsCommand", "screenQual"); + exit(1); + } + } //**********************************************************************************************************************