X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=screenseqscommand.cpp;h=5a9c0c8320b7305834cdf4bb544effd515728417;hb=974f20de01dc6ae71bbb0a7baf8860059cd2f27f;hp=6a9a61323b50a4ea688ba9faa7ea855a233bbab9;hpb=f687723a8357916e86a05116978e6869b039ce36;p=mothur.git diff --git a/screenseqscommand.cpp b/screenseqscommand.cpp index 6a9a613..5a9c0c8 100644 --- a/screenseqscommand.cpp +++ b/screenseqscommand.cpp @@ -8,28 +8,29 @@ */ #include "screenseqscommand.h" - +#include "counttable.h" //********************************************************************************************************************** vector ScreenSeqsCommand::setParameters(){ try { - CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta); - CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname); - CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pgroup); - CommandParameter pqfile("qfile", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pqfile); - CommandParameter palignreport("alignreport", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(palignreport); - CommandParameter ptax("taxonomy", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(ptax); - CommandParameter pstart("start", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pstart); - CommandParameter pend("end", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pend); - CommandParameter pmaxambig("maxambig", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pmaxambig); - CommandParameter pmaxhomop("maxhomop", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pmaxhomop); - CommandParameter pminlength("minlength", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pminlength); - CommandParameter pmaxlength("maxlength", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pmaxlength); - CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors); - CommandParameter pcriteria("criteria", "Number", "", "90", "", "", "",false,false); parameters.push_back(pcriteria); - CommandParameter poptimize("optimize", "Multiple", "none-start-end-maxambig-maxhomop-minlength-maxlength", "none", "", "", "",true,false); parameters.push_back(poptimize); - CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir); - CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir); + CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none","fasta",false,true,true); parameters.push_back(pfasta); + CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "none","name",false,false,true); parameters.push_back(pname); + CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "none", "none","count",false,false,true); parameters.push_back(pcount); + CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "none", "none","group",false,false,true); parameters.push_back(pgroup); + CommandParameter pqfile("qfile", "InputTypes", "", "", "none", "none", "none","qfile",false,false); parameters.push_back(pqfile); + CommandParameter palignreport("alignreport", "InputTypes", "", "", "none", "none", "none","alignreport",false,false); parameters.push_back(palignreport); + CommandParameter ptax("taxonomy", "InputTypes", "", "", "none", "none", "none","taxonomy",false,false); parameters.push_back(ptax); + CommandParameter pstart("start", "Number", "", "-1", "", "", "","",false,false,true); parameters.push_back(pstart); + CommandParameter pend("end", "Number", "", "-1", "", "", "","",false,false,true); parameters.push_back(pend); + CommandParameter pmaxambig("maxambig", "Number", "", "-1", "", "", "","",false,false); parameters.push_back(pmaxambig); + CommandParameter pmaxhomop("maxhomop", "Number", "", "-1", "", "", "","",false,false); parameters.push_back(pmaxhomop); + CommandParameter pminlength("minlength", "Number", "", "-1", "", "", "","",false,false); parameters.push_back(pminlength); + CommandParameter pmaxlength("maxlength", "Number", "", "-1", "", "", "","",false,false); parameters.push_back(pmaxlength); + CommandParameter pprocessors("processors", "Number", "", "1", "", "", "","",false,false,true); parameters.push_back(pprocessors); + CommandParameter pcriteria("criteria", "Number", "", "90", "", "", "","",false,false); parameters.push_back(pcriteria); + CommandParameter poptimize("optimize", "Multiple", "none-start-end-maxambig-maxhomop-minlength-maxlength", "none", "", "", "","",true,false); parameters.push_back(poptimize); + CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir); + CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir); vector myArray; for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); } @@ -44,8 +45,8 @@ vector ScreenSeqsCommand::setParameters(){ string ScreenSeqsCommand::getHelpString(){ try { string helpString = ""; - helpString += "The screen.seqs command reads a fastafile and creates .....\n"; - helpString += "The screen.seqs command parameters are fasta, start, end, maxambig, maxhomop, minlength, maxlength, name, group, qfile, alignreport, taxonomy, optimize, criteria and processors.\n"; + helpString += "The screen.seqs command reads a fastafile and screens sequences.\n"; + helpString += "The screen.seqs command parameters are fasta, start, end, maxambig, maxhomop, minlength, maxlength, name, group, count, qfile, alignreport, taxonomy, optimize, criteria and processors.\n"; helpString += "The fasta parameter is required.\n"; helpString += "The alignreport and taxonomy parameters allow you to remove bad seqs from taxonomy and alignreport files.\n"; helpString += "The start parameter is used to set a position the \"good\" sequences must start by. The default is -1.\n"; @@ -71,32 +72,27 @@ string ScreenSeqsCommand::getHelpString(){ } } //********************************************************************************************************************** -string ScreenSeqsCommand::getOutputFileNameTag(string type, string inputName=""){ - try { - string outputFileName = ""; - map >::iterator it; +string ScreenSeqsCommand::getOutputPattern(string type) { + try { + string pattern = ""; - //is this a type this command creates - it = outputTypes.find(type); - if (it == outputTypes.end()) { m->mothurOut("[ERROR]: this command doesn't create a " + type + " output file.\n"); } - else { - if (type == "fasta") { outputFileName = "good" + m->getExtension(inputName); } - else if (type == "taxonomy") { outputFileName = "good" + m->getExtension(inputName); } - else if (type == "name") { outputFileName = "good" + m->getExtension(inputName); } - else if (type == "group") { outputFileName = "good" + m->getExtension(inputName); } - else if (type == "accnos") { outputFileName = "bad.accnos"; } - else if (type == "qfile") { outputFileName = "good" + m->getExtension(inputName); } - else if (type == "alignreport") { outputFileName = "good.align.report"; } - else { m->mothurOut("[ERROR]: No definition for type " + type + " output file tag.\n"); m->control_pressed = true; } - } - return outputFileName; - } - catch(exception& e) { - m->errorOut(e, "ScreenSeqsCommand", "getOutputFileNameTag"); - exit(1); - } + if (type == "fasta") { pattern = "[filename],good,[extension]"; } + else if (type == "taxonomy") { pattern = "[filename],good,[extension]"; } + else if (type == "name") { pattern = "[filename],good,[extension]"; } + else if (type == "group") { pattern = "[filename],good,[extension]"; } + else if (type == "count") { pattern = "[filename],good,[extension]"; } + else if (type == "accnos") { pattern = "[filename],bad.accnos"; } + else if (type == "qfile") { pattern = "[filename],good,[extension]"; } + else if (type == "alignreport") { pattern = "[filename],good.align.report"; } + else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true; } + + return pattern; + } + catch(exception& e) { + m->errorOut(e, "ScreenSeqsCommand", "getOutputPattern"); + exit(1); + } } - //********************************************************************************************************************** ScreenSeqsCommand::ScreenSeqsCommand(){ try { @@ -110,6 +106,7 @@ ScreenSeqsCommand::ScreenSeqsCommand(){ outputTypes["accnos"] = tempOutNames; outputTypes["qfile"] = tempOutNames; outputTypes["taxonomy"] = tempOutNames; + outputTypes["count"] = tempOutNames; } catch(exception& e) { m->errorOut(e, "ScreenSeqsCommand", "ScreenSeqsCommand"); @@ -149,6 +146,7 @@ ScreenSeqsCommand::ScreenSeqsCommand(string option) { outputTypes["accnos"] = tempOutNames; outputTypes["qfile"] = tempOutNames; outputTypes["taxonomy"] = tempOutNames; + outputTypes["count"] = tempOutNames; //if the user changes the input directory command factory will send this info to us in the output parameter string inputDir = validParameter.validFile(parameters, "inputdir", false); @@ -202,6 +200,14 @@ ScreenSeqsCommand::ScreenSeqsCommand(string option) { //if the user has not given a path then, add inputdir. else leave path alone. if (path == "") { parameters["taxonomy"] = inputDir + it->second; } } + + it = parameters.find("count"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["count"] = inputDir + it->second; } + } } //check for required parameters @@ -229,6 +235,19 @@ ScreenSeqsCommand::ScreenSeqsCommand(string option) { else if (namefile == "not found") { namefile = ""; } else { m->setNameFile(namefile); } + countfile = validParameter.validFile(parameters, "count", true); + if (countfile == "not open") { countfile = ""; abort = true; } + else if (countfile == "not found") { countfile = ""; } + else { m->setCountTableFile(countfile); } + + if ((namefile != "") && (countfile != "")) { + m->mothurOut("[ERROR]: you may only use one of the following: name or count."); m->mothurOutEndLine(); abort = true; + } + + if ((groupfile != "") && (countfile != "")) { + m->mothurOut("[ERROR]: you may only use one of the following: group or count."); m->mothurOutEndLine(); abort=true; + } + alignreport = validParameter.validFile(parameters, "alignreport", true); if (alignreport == "not open") { abort = true; } else if (alignreport == "not found") { alignreport = ""; } @@ -288,10 +307,12 @@ ScreenSeqsCommand::ScreenSeqsCommand(string option) { temp = validParameter.validFile(parameters, "criteria", false); if (temp == "not found"){ temp = "90"; } m->mothurConvert(temp, criteria); - if (namefile == "") { - vector files; files.push_back(fastafile); - parser.getNameFile(files); - } + if (countfile == "") { + if (namefile == "") { + vector files; files.push_back(fastafile); + parser.getNameFile(files); + } + } } } @@ -312,6 +333,11 @@ int ScreenSeqsCommand::execute(){ if (optimize.size() != 0) { //get summary is paralellized so we need to divideFile, no need to do this step twice so I moved it here //use the namefile to optimize correctly if (namefile != "") { nameMap = m->readNames(namefile); } + else if (countfile != "") { + CountTable ct; + ct.readTable(countfile); + nameMap = ct.getNameMap(); + } getSummary(positions); } else { @@ -335,9 +361,13 @@ int ScreenSeqsCommand::execute(){ } #endif } - - string goodSeqFile = outputDir + m->getRootName(m->getSimpleName(fastafile)) + getOutputFileNameTag("fasta", fastafile); - string badAccnosFile = outputDir + m->getRootName(m->getSimpleName(fastafile)) + getOutputFileNameTag("accnos"); + + map variables; + variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(fastafile)); + string badAccnosFile = getOutputFileName("accnos",variables); + variables["[extension]"] = m->getExtension(fastafile); + string goodSeqFile = getOutputFileName("fasta", variables); + int numFastaSeqs = 0; set badSeqNames; @@ -472,7 +502,9 @@ int ScreenSeqsCommand::execute(){ screenNameGroupFile(badSeqNames); if (m->control_pressed) { m->mothurRemove(goodSeqFile); return 0; } }else if(groupfile != "") { screenGroupFile(badSeqNames); } // this screens just the group - + else if (countfile != "") { screenCountFile(badSeqNames); } + + if (m->control_pressed) { m->mothurRemove(goodSeqFile); return 0; } if(alignreport != "") { screenAlignReport(badSeqNames); } @@ -519,6 +551,11 @@ int ScreenSeqsCommand::execute(){ if (itTypes != outputTypes.end()) { if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setTaxonomyFile(current); } } + + itTypes = outputTypes.find("count"); + if (itTypes != outputTypes.end()) { + if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setCountTableFile(current); } + } m->mothurOut("It took " + toString(time(NULL) - start) + " secs to screen " + toString(numFastaSeqs) + " sequences."); m->mothurOutEndLine(); @@ -540,8 +577,10 @@ int ScreenSeqsCommand::screenNameGroupFile(set badSeqNames){ set badSeqGroups; string seqName, seqList, group; set::iterator it; - - string goodNameFile = outputDir + m->getRootName(m->getSimpleName(namefile)) + getOutputFileNameTag("name", namefile); + map variables; + variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(namefile)); + variables["[extension]"] = m->getExtension(namefile); + string goodNameFile = getOutputFileName("name", variables); outputNames.push_back(goodNameFile); outputTypes["name"].push_back(goodNameFile); ofstream goodNameOut; m->openOutputFile(goodNameFile, goodNameOut); @@ -586,8 +625,10 @@ int ScreenSeqsCommand::screenNameGroupFile(set badSeqNames){ ifstream inputGroups; m->openInputFile(groupfile, inputGroups); - - string goodGroupFile = outputDir + m->getRootName(m->getSimpleName(groupfile)) + getOutputFileNameTag("group", groupfile); + variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(groupfile)); + variables["[extension]"] = m->getExtension(groupfile); + string goodGroupFile = getOutputFileName("group", variables); + outputNames.push_back(goodGroupFile); outputTypes["group"].push_back(goodGroupFile); ofstream goodGroupOut; m->openOutputFile(goodGroupFile, goodGroupOut); @@ -919,8 +960,10 @@ int ScreenSeqsCommand::screenGroupFile(set badSeqNames){ m->openInputFile(groupfile, inputGroups); string seqName, group; set::iterator it; - - string goodGroupFile = outputDir + m->getRootName(m->getSimpleName(groupfile)) + getOutputFileNameTag("group", groupfile); + map variables; + variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(groupfile)); + variables["[extension]"] = m->getExtension(groupfile); + string goodGroupFile = getOutputFileName("group", variables); outputNames.push_back(goodGroupFile); outputTypes["group"].push_back(goodGroupFile); ofstream goodGroupOut; m->openOutputFile(goodGroupFile, goodGroupOut); @@ -962,7 +1005,72 @@ int ScreenSeqsCommand::screenGroupFile(set badSeqNames){ exit(1); } } +//*************************************************************************************************************** +int ScreenSeqsCommand::screenCountFile(set badSeqNames){ + try { + ifstream in; + m->openInputFile(countfile, in); + set::iterator it; + map variables; + variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(countfile)); + variables["[extension]"] = m->getExtension(countfile); + string goodCountFile = getOutputFileName("count", variables); + + outputNames.push_back(goodCountFile); outputTypes["count"].push_back(goodCountFile); + ofstream goodCountOut; m->openOutputFile(goodCountFile, goodCountOut); + + string headers = m->getline(in); m->gobble(in); + goodCountOut << headers << endl; + + string name, rest; int thisTotal; + while (!in.eof()) { + if (m->control_pressed) { goodCountOut.close(); in.close(); m->mothurRemove(goodCountFile); return 0; } + + in >> name; m->gobble(in); + in >> thisTotal; m->gobble(in); + rest = m->getline(in); m->gobble(in); + + it = badSeqNames.find(name); + + if(it != badSeqNames.end()){ + badSeqNames.erase(it); + } + else{ + goodCountOut << name << '\t' << thisTotal << '\t' << rest << endl; + } + } + + if (m->control_pressed) { goodCountOut.close(); in.close(); m->mothurRemove(goodCountFile); return 0; } + + //we were unable to remove some of the bad sequences + if (badSeqNames.size() != 0) { + for (it = badSeqNames.begin(); it != badSeqNames.end(); it++) { + m->mothurOut("Your count file does not include the sequence " + *it + " please correct."); + m->mothurOutEndLine(); + } + } + + in.close(); + goodCountOut.close(); + + //check for groups that have been eliminated + CountTable ct; + if (ct.testGroups(goodCountFile)) { + ct.readTable(goodCountFile); + ct.printTable(goodCountFile); + } + + if (m->control_pressed) { m->mothurRemove(goodCountFile); } + + return 0; + + } + catch(exception& e) { + m->errorOut(e, "ScreenSeqsCommand", "screenCountFile"); + exit(1); + } +} //*************************************************************************************************************** int ScreenSeqsCommand::screenAlignReport(set badSeqNames){ @@ -972,7 +1080,10 @@ int ScreenSeqsCommand::screenAlignReport(set badSeqNames){ string seqName, group; set::iterator it; - string goodAlignReportFile = outputDir + m->getRootName(m->getSimpleName(alignreport)) + getOutputFileNameTag("alignreport"); + map variables; + variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(alignreport)); + string goodAlignReportFile = getOutputFileName("alignreport", variables); + outputNames.push_back(goodAlignReportFile); outputTypes["alignreport"].push_back(goodAlignReportFile); ofstream goodAlignReportOut; m->openOutputFile(goodAlignReportFile, goodAlignReportOut); @@ -1035,8 +1146,11 @@ int ScreenSeqsCommand::screenTaxonomy(set badSeqNames){ m->openInputFile(taxonomy, input); string seqName, tax; set::iterator it; - - string goodTaxFile = outputDir + m->getRootName(m->getSimpleName(taxonomy)) + getOutputFileNameTag("taxonomy", taxonomy); + map variables; + variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(taxonomy)); + variables["[extension]"] = m->getExtension(taxonomy); + string goodTaxFile = getOutputFileName("taxonomy", variables); + outputNames.push_back(goodTaxFile); outputTypes["taxonomy"].push_back(goodTaxFile); ofstream goodTaxOut; m->openOutputFile(goodTaxFile, goodTaxOut); @@ -1084,8 +1198,11 @@ int ScreenSeqsCommand::screenQual(set badSeqNames){ ifstream in; m->openInputFile(qualfile, in); set::iterator it; + map variables; + variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(qualfile)); + variables["[extension]"] = m->getExtension(qualfile); + string goodQualFile = getOutputFileName("qfile", variables); - string goodQualFile = outputDir + m->getRootName(m->getSimpleName(qualfile)) + getOutputFileNameTag("qfile", qualfile); outputNames.push_back(goodQualFile); outputTypes["qfile"].push_back(goodQualFile); ofstream goodQual; m->openOutputFile(goodQualFile, goodQual);