*/
#include "screenseqscommand.h"
-
+#include "counttable.h"
//**********************************************************************************************************************
vector<string> ScreenSeqsCommand::setParameters(){
try {
- CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta);
- CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname);
- CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pgroup);
- CommandParameter pqfile("qfile", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pqfile);
- CommandParameter palignreport("alignreport", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(palignreport);
- CommandParameter ptax("taxonomy", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(ptax);
- CommandParameter pstart("start", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pstart);
- CommandParameter pend("end", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pend);
- CommandParameter pmaxambig("maxambig", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pmaxambig);
- CommandParameter pmaxhomop("maxhomop", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pmaxhomop);
- CommandParameter pminlength("minlength", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pminlength);
- CommandParameter pmaxlength("maxlength", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pmaxlength);
- CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
- CommandParameter pcriteria("criteria", "Number", "", "90", "", "", "",false,false); parameters.push_back(pcriteria);
- CommandParameter poptimize("optimize", "Multiple", "none-start-end-maxambig-maxhomop-minlength-maxlength", "none", "", "", "",true,false); parameters.push_back(poptimize);
- CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
- CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
+ CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none","fasta",false,true,true); parameters.push_back(pfasta);
+ CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "none","name",false,false,true); parameters.push_back(pname);
+ CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "none", "none","count",false,false,true); parameters.push_back(pcount);
+ CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "none", "none","group",false,false,true); parameters.push_back(pgroup);
+ CommandParameter pqfile("qfile", "InputTypes", "", "", "none", "none", "none","qfile",false,false); parameters.push_back(pqfile);
+ CommandParameter palignreport("alignreport", "InputTypes", "", "", "none", "none", "none","alignreport",false,false); parameters.push_back(palignreport);
+ CommandParameter ptax("taxonomy", "InputTypes", "", "", "none", "none", "none","taxonomy",false,false); parameters.push_back(ptax);
+ CommandParameter pstart("start", "Number", "", "-1", "", "", "","",false,false,true); parameters.push_back(pstart);
+ CommandParameter pend("end", "Number", "", "-1", "", "", "","",false,false,true); parameters.push_back(pend);
+ CommandParameter pmaxambig("maxambig", "Number", "", "-1", "", "", "","",false,false); parameters.push_back(pmaxambig);
+ CommandParameter pmaxhomop("maxhomop", "Number", "", "-1", "", "", "","",false,false); parameters.push_back(pmaxhomop);
+ CommandParameter pminlength("minlength", "Number", "", "-1", "", "", "","",false,false); parameters.push_back(pminlength);
+ CommandParameter pmaxlength("maxlength", "Number", "", "-1", "", "", "","",false,false); parameters.push_back(pmaxlength);
+ CommandParameter pprocessors("processors", "Number", "", "1", "", "", "","",false,false,true); parameters.push_back(pprocessors);
+ CommandParameter pcriteria("criteria", "Number", "", "90", "", "", "","",false,false); parameters.push_back(pcriteria);
+ CommandParameter poptimize("optimize", "Multiple", "none-start-end-maxambig-maxhomop-minlength-maxlength", "none", "", "", "","",true,false); parameters.push_back(poptimize);
+ CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir);
+ CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir);
vector<string> myArray;
for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); }
string ScreenSeqsCommand::getHelpString(){
try {
string helpString = "";
- helpString += "The screen.seqs command reads a fastafile and creates .....\n";
- helpString += "The screen.seqs command parameters are fasta, start, end, maxambig, maxhomop, minlength, maxlength, name, group, qfile, alignreport, taxonomy, optimize, criteria and processors.\n";
+ helpString += "The screen.seqs command reads a fastafile and screens sequences.\n";
+ helpString += "The screen.seqs command parameters are fasta, start, end, maxambig, maxhomop, minlength, maxlength, name, group, count, qfile, alignreport, taxonomy, optimize, criteria and processors.\n";
helpString += "The fasta parameter is required.\n";
helpString += "The alignreport and taxonomy parameters allow you to remove bad seqs from taxonomy and alignreport files.\n";
- helpString += "The start parameter .... The default is -1.\n";
- helpString += "The end parameter .... The default is -1.\n";
+ helpString += "The start parameter is used to set a position the \"good\" sequences must start by. The default is -1.\n";
+ helpString += "The end parameter is used to set a position the \"good\" sequences must end after. The default is -1.\n";
helpString += "The maxambig parameter allows you to set the maximum number of ambigious bases allowed. The default is -1.\n";
helpString += "The maxhomop parameter allows you to set a maximum homopolymer length. \n";
helpString += "The minlength parameter allows you to set and minimum sequence length. \n";
}
}
//**********************************************************************************************************************
+string ScreenSeqsCommand::getOutputPattern(string type) {
+ try {
+ string pattern = "";
+
+ if (type == "fasta") { pattern = "[filename],good,[extension]"; }
+ else if (type == "taxonomy") { pattern = "[filename],good,[extension]"; }
+ else if (type == "name") { pattern = "[filename],good,[extension]"; }
+ else if (type == "group") { pattern = "[filename],good,[extension]"; }
+ else if (type == "count") { pattern = "[filename],good,[extension]"; }
+ else if (type == "accnos") { pattern = "[filename],bad.accnos"; }
+ else if (type == "qfile") { pattern = "[filename],good,[extension]"; }
+ else if (type == "alignreport") { pattern = "[filename],good.align.report"; }
+ else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true; }
+
+ return pattern;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "ScreenSeqsCommand", "getOutputPattern");
+ exit(1);
+ }
+}
+//**********************************************************************************************************************
ScreenSeqsCommand::ScreenSeqsCommand(){
try {
abort = true; calledHelp = true;
outputTypes["accnos"] = tempOutNames;
outputTypes["qfile"] = tempOutNames;
outputTypes["taxonomy"] = tempOutNames;
+ outputTypes["count"] = tempOutNames;
}
catch(exception& e) {
m->errorOut(e, "ScreenSeqsCommand", "ScreenSeqsCommand");
outputTypes["accnos"] = tempOutNames;
outputTypes["qfile"] = tempOutNames;
outputTypes["taxonomy"] = tempOutNames;
+ outputTypes["count"] = tempOutNames;
//if the user changes the input directory command factory will send this info to us in the output parameter
string inputDir = validParameter.validFile(parameters, "inputdir", false);
//if the user has not given a path then, add inputdir. else leave path alone.
if (path == "") { parameters["taxonomy"] = inputDir + it->second; }
}
+
+ it = parameters.find("count");
+ //user has given a template file
+ if(it != parameters.end()){
+ path = m->hasPath(it->second);
+ //if the user has not given a path then, add inputdir. else leave path alone.
+ if (path == "") { parameters["count"] = inputDir + it->second; }
+ }
}
//check for required parameters
else if (namefile == "not found") { namefile = ""; }
else { m->setNameFile(namefile); }
+ countfile = validParameter.validFile(parameters, "count", true);
+ if (countfile == "not open") { countfile = ""; abort = true; }
+ else if (countfile == "not found") { countfile = ""; }
+ else { m->setCountTableFile(countfile); }
+
+ if ((namefile != "") && (countfile != "")) {
+ m->mothurOut("[ERROR]: you may only use one of the following: name or count."); m->mothurOutEndLine(); abort = true;
+ }
+
+ if ((groupfile != "") && (countfile != "")) {
+ m->mothurOut("[ERROR]: you may only use one of the following: group or count."); m->mothurOutEndLine(); abort=true;
+ }
+
alignreport = validParameter.validFile(parameters, "alignreport", true);
if (alignreport == "not open") { abort = true; }
else if (alignreport == "not found") { alignreport = ""; }
temp = validParameter.validFile(parameters, "criteria", false); if (temp == "not found"){ temp = "90"; }
m->mothurConvert(temp, criteria);
- if (namefile == "") {
- vector<string> files; files.push_back(fastafile);
- parser.getNameFile(files);
- }
+ if (countfile == "") {
+ if (namefile == "") {
+ vector<string> files; files.push_back(fastafile);
+ parser.getNameFile(files);
+ }
+ }
}
}
if (optimize.size() != 0) { //get summary is paralellized so we need to divideFile, no need to do this step twice so I moved it here
//use the namefile to optimize correctly
if (namefile != "") { nameMap = m->readNames(namefile); }
+ else if (countfile != "") {
+ CountTable ct;
+ ct.readTable(countfile);
+ nameMap = ct.getNameMap();
+ }
getSummary(positions);
}
else {
- #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+ #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
positions = m->divideFile(fastafile, processors);
for (int i = 0; i < (positions.size()-1); i++) { lines.push_back(linePair(positions[i], positions[(i+1)])); }
#else
else {
int numFastaSeqs = 0;
positions = m->setFilePosFasta(fastafile, numFastaSeqs);
+ if (positions.size() < processors) { processors = positions.size(); }
//figure out how many sequences you have to process
int numSeqsPerProcessor = numFastaSeqs / processors;
}
#endif
}
-
- string goodSeqFile = outputDir + m->getRootName(m->getSimpleName(fastafile)) + "good" + m->getExtension(fastafile);
- string badAccnosFile = outputDir + m->getRootName(m->getSimpleName(fastafile)) + "bad.accnos";
+
+ map<string, string> variables;
+ variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(fastafile));
+ string badAccnosFile = getOutputFileName("accnos",variables);
+ variables["[extension]"] = m->getExtension(fastafile);
+ string goodSeqFile = getOutputFileName("fasta", variables);
+
int numFastaSeqs = 0;
set<string> badSeqNames;
MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case
#else
-
- //#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
- if(processors == 1){
- numFastaSeqs = driver(lines[0], goodSeqFile, badAccnosFile, fastafile, badSeqNames);
- }else{
- numFastaSeqs = createProcesses(goodSeqFile, badAccnosFile, fastafile, badSeqNames);
- }
- //#else
- // numFastaSeqs = driver(lines[0], goodSeqFile, badAccnosFile, fastafile, badSeqNames);
- //#endif
- if (m->control_pressed) { m->mothurRemove(goodSeqFile); return 0; }
+ if(processors == 1){ numFastaSeqs = driver(lines[0], goodSeqFile, badAccnosFile, fastafile, badSeqNames); }
+ else{ numFastaSeqs = createProcesses(goodSeqFile, badAccnosFile, fastafile, badSeqNames); }
+
+ if (m->control_pressed) { m->mothurRemove(goodSeqFile); return 0; }
#endif
#ifdef USE_MPI
screenNameGroupFile(badSeqNames);
if (m->control_pressed) { m->mothurRemove(goodSeqFile); return 0; }
}else if(groupfile != "") { screenGroupFile(badSeqNames); } // this screens just the group
-
+ else if (countfile != "") { screenCountFile(badSeqNames); }
+
+
if (m->control_pressed) { m->mothurRemove(goodSeqFile); return 0; }
if(alignreport != "") { screenAlignReport(badSeqNames); }
if (itTypes != outputTypes.end()) {
if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setTaxonomyFile(current); }
}
+
+ itTypes = outputTypes.find("count");
+ if (itTypes != outputTypes.end()) {
+ if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setCountTableFile(current); }
+ }
m->mothurOut("It took " + toString(time(NULL) - start) + " secs to screen " + toString(numFastaSeqs) + " sequences.");
m->mothurOutEndLine();
set<string> badSeqGroups;
string seqName, seqList, group;
set<string>::iterator it;
-
- string goodNameFile = outputDir + m->getRootName(m->getSimpleName(namefile)) + "good" + m->getExtension(namefile);
+ map<string, string> variables;
+ variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(namefile));
+ variables["[extension]"] = m->getExtension(namefile);
+ string goodNameFile = getOutputFileName("name", variables);
outputNames.push_back(goodNameFile); outputTypes["name"].push_back(goodNameFile);
ofstream goodNameOut; m->openOutputFile(goodNameFile, goodNameOut);
ifstream inputGroups;
m->openInputFile(groupfile, inputGroups);
-
- string goodGroupFile = outputDir + m->getRootName(m->getSimpleName(groupfile)) + "good" + m->getExtension(groupfile);
+ variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(groupfile));
+ variables["[extension]"] = m->getExtension(groupfile);
+ string goodGroupFile = getOutputFileName("group", variables);
+
outputNames.push_back(goodGroupFile); outputTypes["group"].push_back(goodGroupFile);
ofstream goodGroupOut; m->openOutputFile(goodGroupFile, goodGroupOut);
vector<int> longHomoPolymer;
vector<unsigned long long> positions;
-#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
positions = m->divideFile(fastafile, processors);
for (int i = 0; i < (positions.size()-1); i++) { lines.push_back(linePair(positions[i], positions[(i+1)])); }
#else
else {
int numFastaSeqs = 0;
positions = m->setFilePosFasta(fastafile, numFastaSeqs);
+ if (positions.size() < processors) { processors = positions.size(); }
//figure out how many sequences you have to process
int numSeqsPerProcessor = numFastaSeqs / processors;
driverCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, fastafile, lines[0]);
#else
int numSeqs = 0;
- //#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+ //#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
if(processors == 1){
numSeqs = driverCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, fastafile, lines[0]);
}else{
count++;
}
//if((count) % 100 == 0){ m->mothurOut("Optimizing sequence: " + toString(count)); m->mothurOutEndLine(); }
- #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+ #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
unsigned long long pos = in.tellg();
if ((pos == -1) || (pos >= filePos.end)) { break; }
#else
int num = 0;
vector<int> processIDS;
-#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
//loop through and create all the processes you want
while (process != processors) {
m->openInputFile(groupfile, inputGroups);
string seqName, group;
set<string>::iterator it;
-
- string goodGroupFile = outputDir + m->getRootName(m->getSimpleName(groupfile)) + "good" + m->getExtension(groupfile);
- outputNames.push_back(goodGroupFile); outputTypes["group"].push_back(goodGroupFile);
+ map<string, string> variables;
+ variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(groupfile));
+ variables["[extension]"] = m->getExtension(groupfile);
+ string goodGroupFile = getOutputFileName("group", variables);
+ outputNames.push_back(goodGroupFile); outputTypes["group"].push_back(goodGroupFile);
ofstream goodGroupOut; m->openOutputFile(goodGroupFile, goodGroupOut);
while(!inputGroups.eof()){
exit(1);
}
}
+//***************************************************************************************************************
+int ScreenSeqsCommand::screenCountFile(set<string> badSeqNames){
+ try {
+ ifstream in;
+ m->openInputFile(countfile, in);
+ set<string>::iterator it;
+ map<string, string> variables;
+ variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(countfile));
+ variables["[extension]"] = m->getExtension(countfile);
+ string goodCountFile = getOutputFileName("count", variables);
+
+ outputNames.push_back(goodCountFile); outputTypes["count"].push_back(goodCountFile);
+ ofstream goodCountOut; m->openOutputFile(goodCountFile, goodCountOut);
+
+ string headers = m->getline(in); m->gobble(in);
+ goodCountOut << headers << endl;
+
+ string name, rest; int thisTotal;
+ while (!in.eof()) {
+ if (m->control_pressed) { goodCountOut.close(); in.close(); m->mothurRemove(goodCountFile); return 0; }
+
+ in >> name; m->gobble(in);
+ in >> thisTotal; m->gobble(in);
+ rest = m->getline(in); m->gobble(in);
+
+ it = badSeqNames.find(name);
+
+ if(it != badSeqNames.end()){
+ badSeqNames.erase(it);
+ }
+ else{
+ goodCountOut << name << '\t' << thisTotal << '\t' << rest << endl;
+ }
+ }
+
+ if (m->control_pressed) { goodCountOut.close(); in.close(); m->mothurRemove(goodCountFile); return 0; }
+
+ //we were unable to remove some of the bad sequences
+ if (badSeqNames.size() != 0) {
+ for (it = badSeqNames.begin(); it != badSeqNames.end(); it++) {
+ m->mothurOut("Your count file does not include the sequence " + *it + " please correct.");
+ m->mothurOutEndLine();
+ }
+ }
+
+ in.close();
+ goodCountOut.close();
+
+ //check for groups that have been eliminated
+ CountTable ct;
+ if (ct.testGroups(goodCountFile)) {
+ ct.readTable(goodCountFile);
+ ct.printTable(goodCountFile);
+ }
+
+ if (m->control_pressed) { m->mothurRemove(goodCountFile); }
+
+ return 0;
+
+ }
+ catch(exception& e) {
+ m->errorOut(e, "ScreenSeqsCommand", "screenCountFile");
+ exit(1);
+ }
+}
//***************************************************************************************************************
int ScreenSeqsCommand::screenAlignReport(set<string> badSeqNames){
string seqName, group;
set<string>::iterator it;
- string goodAlignReportFile = outputDir + m->getRootName(m->getSimpleName(alignreport)) + "good" + m->getExtension(alignreport);
+ map<string, string> variables;
+ variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(alignreport));
+ string goodAlignReportFile = getOutputFileName("alignreport", variables);
+
outputNames.push_back(goodAlignReportFile); outputTypes["alignreport"].push_back(goodAlignReportFile);
ofstream goodAlignReportOut; m->openOutputFile(goodAlignReportFile, goodAlignReportOut);
m->openInputFile(taxonomy, input);
string seqName, tax;
set<string>::iterator it;
-
- string goodTaxFile = outputDir + m->getRootName(m->getSimpleName(taxonomy)) + "good" + m->getExtension(taxonomy);
+ map<string, string> variables;
+ variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(taxonomy));
+ variables["[extension]"] = m->getExtension(taxonomy);
+ string goodTaxFile = getOutputFileName("taxonomy", variables);
+
outputNames.push_back(goodTaxFile); outputTypes["taxonomy"].push_back(goodTaxFile);
ofstream goodTaxOut; m->openOutputFile(goodTaxFile, goodTaxOut);
ifstream in;
m->openInputFile(qualfile, in);
set<string>::iterator it;
+ map<string, string> variables;
+ variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(qualfile));
+ variables["[extension]"] = m->getExtension(qualfile);
+ string goodQualFile = getOutputFileName("qfile", variables);
- string goodQualFile = outputDir + m->getRootName(m->getSimpleName(qualfile)) + "good" + m->getExtension(qualfile);
outputNames.push_back(goodQualFile); outputTypes["qfile"].push_back(goodQualFile);
ofstream goodQual; m->openOutputFile(goodQualFile, goodQual);
count++;
}
- #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+ #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
unsigned long long pos = inFASTA.tellg();
if ((pos == -1) || (pos >= filePos.end)) { break; }
#else
int length = MPIPos[start+i+1] - MPIPos[start+i];
char* buf4 = new char[length];
- memcpy(buf4, outputString.c_str(), length);
MPI_File_read_at(inMPI, MPIPos[start+i], buf4, length, MPI_CHAR, &status);
int process = 1;
int num = 0;
-#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
//loop through and create all the processes you want
while (process != processors) {
if (i!=0) {extension += toString(i) + ".temp"; processIDS.push_back(i); }
// Allocate memory for thread data.
- sumScreenData* tempSum = new sumScreenData(startPos, endPos, maxAmbig, maxHomoP, minLength, maxLength, filename, m, lines[i].start, lines[i].end,goodFileName+extension, badAccnos+extension, &badSeqNames);
+ sumScreenData* tempSum = new sumScreenData(startPos, endPos, maxAmbig, maxHomoP, minLength, maxLength, filename, m, lines[i].start, lines[i].end,goodFileName+extension, badAccnos+extension);
pDataArray.push_back(tempSum);
//default security attributes, thread function name, argument to thread function, use default creation flags, returns the thread identifier
//Close all thread handles and free memory allocations.
for(int i=0; i < pDataArray.size(); i++){
num += pDataArray[i]->count;
+ for (set<string>::iterator it = pDataArray[i]->badSeqNames.begin(); it != pDataArray[i]->badSeqNames.end(); it++) { badSeqNames.insert(*it); }
CloseHandle(hThreadArray[i]);
delete pDataArray[i];
}