CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pgroup);
CommandParameter pqfile("qfile", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pqfile);
CommandParameter palignreport("alignreport", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(palignreport);
+ CommandParameter ptax("taxonomy", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(ptax);
CommandParameter pstart("start", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pstart);
CommandParameter pend("end", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pend);
CommandParameter pmaxambig("maxambig", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pmaxambig);
try {
string helpString = "";
helpString += "The screen.seqs command reads a fastafile and creates .....\n";
- helpString += "The screen.seqs command parameters are fasta, start, end, maxambig, maxhomop, minlength, maxlength, name, group, qfile, optimize, criteria and processors.\n";
+ helpString += "The screen.seqs command parameters are fasta, start, end, maxambig, maxhomop, minlength, maxlength, name, group, qfile, alignreport, taxonomy, optimize, criteria and processors.\n";
helpString += "The fasta parameter is required.\n";
+ helpString += "The alignreport and taxonomy parameters allow you to remove bad seqs from taxonomy and alignreport files.\n";
helpString += "The start parameter .... The default is -1.\n";
helpString += "The end parameter .... The default is -1.\n";
helpString += "The maxambig parameter allows you to set the maximum number of ambigious bases allowed. The default is -1.\n";
outputTypes["alignreport"] = tempOutNames;
outputTypes["accnos"] = tempOutNames;
outputTypes["qfile"] = tempOutNames;
+ outputTypes["taxonomy"] = tempOutNames;
}
catch(exception& e) {
m->errorOut(e, "ScreenSeqsCommand", "ScreenSeqsCommand");
outputTypes["alignreport"] = tempOutNames;
outputTypes["accnos"] = tempOutNames;
outputTypes["qfile"] = tempOutNames;
+ outputTypes["taxonomy"] = tempOutNames;
//if the user changes the input directory command factory will send this info to us in the output parameter
string inputDir = validParameter.validFile(parameters, "inputdir", false);
if (path == "") { parameters["qfile"] = inputDir + it->second; }
}
+ it = parameters.find("taxonomy");
+ //user has given a taxonomy file
+ if(it != parameters.end()){
+ path = m->hasPath(it->second);
+ //if the user has not given a path then, add inputdir. else leave path alone.
+ if (path == "") { parameters["taxonomy"] = inputDir + it->second; }
+ }
}
//check for required parameters
alignreport = validParameter.validFile(parameters, "alignreport", true);
if (alignreport == "not open") { abort = true; }
- else if (alignreport == "not found") { alignreport = ""; }
+ else if (alignreport == "not found") { alignreport = ""; }
+
+ taxonomy = validParameter.validFile(parameters, "taxonomy", true);
+ if (taxonomy == "not open") { abort = true; }
+ else if (taxonomy == "not found") { taxonomy = ""; }
//if the user changes the output directory command factory will send this info to us in the output parameter
outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){
// ...at some point should added some additional type checking...
string temp;
temp = validParameter.validFile(parameters, "start", false); if (temp == "not found") { temp = "-1"; }
- convert(temp, startPos);
+ m->mothurConvert(temp, startPos);
temp = validParameter.validFile(parameters, "end", false); if (temp == "not found") { temp = "-1"; }
- convert(temp, endPos);
+ m->mothurConvert(temp, endPos);
temp = validParameter.validFile(parameters, "maxambig", false); if (temp == "not found") { temp = "-1"; }
- convert(temp, maxAmbig);
+ m->mothurConvert(temp, maxAmbig);
temp = validParameter.validFile(parameters, "maxhomop", false); if (temp == "not found") { temp = "-1"; }
- convert(temp, maxHomoP);
+ m->mothurConvert(temp, maxHomoP);
temp = validParameter.validFile(parameters, "minlength", false); if (temp == "not found") { temp = "-1"; }
- convert(temp, minLength);
+ m->mothurConvert(temp, minLength);
temp = validParameter.validFile(parameters, "maxlength", false); if (temp == "not found") { temp = "-1"; }
- convert(temp, maxLength);
+ m->mothurConvert(temp, maxLength);
temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); }
m->setProcessors(temp);
- convert(temp, processors);
+ m->mothurConvert(temp, processors);
temp = validParameter.validFile(parameters, "optimize", false); //optimizing trumps the optimized values original value
if (temp == "not found"){ temp = "none"; }
if (optimize.size() == 1) { if (optimize[0] == "none") { optimize.clear(); } }
temp = validParameter.validFile(parameters, "criteria", false); if (temp == "not found"){ temp = "90"; }
- convert(temp, criteria);
+ m->mothurConvert(temp, criteria);
+
+ if (namefile == "") {
+ vector<string> files; files.push_back(fastafile);
+ parser.getNameFile(files);
+ }
}
}
getSummary(positions);
}
else {
+ #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
positions = m->divideFile(fastafile, processors);
for (int i = 0; i < (positions.size()-1); i++) {
lines.push_back(new linePair(positions[i], positions[(i+1)]));
- }
+ }
+ #else
+ positions.push_back(0); positions.push_back(1000);
+ lines.push_back(new linePair(0, 1000));
+ #endif
}
string goodSeqFile = outputDir + m->getRootName(m->getSimpleName(fastafile)) + "good" + m->getExtension(fastafile);
int numFastaSeqs = 0;
set<string> badSeqNames;
int start = time(NULL);
-
+
#ifdef USE_MPI
int pid, numSeqsPerProcessor;
int tag = 2001;
MPI_Status status;
MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
MPI_Comm_size(MPI_COMM_WORLD, &processors);
-
+
MPI_File inMPI;
MPI_File outMPIGood;
MPI_File outMPIBadAccnos;
if(alignreport != "") { screenAlignReport(badSeqNames); }
if(qualfile != "") { screenQual(badSeqNames); }
+ if(taxonomy != "") { screenTaxonomy(badSeqNames); }
if (m->control_pressed) { m->mothurRemove(goodSeqFile); return 0; }
if (itTypes != outputTypes.end()) {
if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setQualFile(current); }
}
+
+ itTypes = outputTypes.find("taxonomy");
+ if (itTypes != outputTypes.end()) {
+ if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setTaxonomyFile(current); }
+ }
m->mothurOut("It took " + toString(time(NULL) - start) + " secs to screen " + toString(numFastaSeqs) + " sequences.");
m->mothurOutEndLine();
vector<int> ambigBases;
vector<int> longHomoPolymer;
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
vector<unsigned long long> positions = m->divideFile(fastafile, processors);
-
+
for (int i = 0; i < (positions.size()-1); i++) {
lines.push_back(new linePair(positions[i], positions[(i+1)]));
}
+#else
+ lines.push_back(new linePair(0, 1000));
+#endif
+#ifdef USE_MPI
+ int pid;
+ MPI_Comm_rank(MPI_COMM_WORLD, &pid);
+
+ if (pid == 0) {
+ driverCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, fastafile, lines[0]);
+#else
int numSeqs = 0;
#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
if(processors == 1){
numSeqs = driverCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, fastafile, lines[0]);
if (m->control_pressed) { return 0; }
#endif
-
+#endif
sort(startPosition.begin(), startPosition.end());
sort(endPosition.begin(), endPosition.end());
sort(seqLength.begin(), seqLength.end());
else if (optimize[i] == "maxlength") { maxLength = seqLength[criteriaPercentile]; m->mothurOut("Optimizing maxlength to " + toString(maxLength) + "."); m->mothurOutEndLine(); }
}
+#ifdef USE_MPI
+ }
+
+ MPI_Status status;
+ MPI_Comm_rank(MPI_COMM_WORLD, &pid);
+ MPI_Comm_size(MPI_COMM_WORLD, &processors);
+
+ if (pid == 0) {
+ //send file positions to all processes
+ for(int i = 1; i < processors; i++) {
+ MPI_Send(&startPos, 1, MPI_INT, i, 2001, MPI_COMM_WORLD);
+ MPI_Send(&endPos, 1, MPI_INT, i, 2001, MPI_COMM_WORLD);
+ MPI_Send(&maxAmbig, 1, MPI_INT, i, 2001, MPI_COMM_WORLD);
+ MPI_Send(&maxHomoP, 1, MPI_INT, i, 2001, MPI_COMM_WORLD);
+ MPI_Send(&minLength, 1, MPI_INT, i, 2001, MPI_COMM_WORLD);
+ MPI_Send(&maxLength, 1, MPI_INT, i, 2001, MPI_COMM_WORLD);
+ }
+ }else {
+ MPI_Recv(&startPos, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status);
+ MPI_Recv(&endPos, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status);
+ MPI_Recv(&maxAmbig, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status);
+ MPI_Recv(&maxHomoP, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status);
+ MPI_Recv(&minLength, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status);
+ MPI_Recv(&maxLength, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status);
+ }
+ MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case
+#endif
return 0;
}
catch(exception& e) {
count++;
}
-
+ //if((count) % 100 == 0){ m->mothurOut("Optimizing sequence: " + toString(count)); m->mothurOutEndLine(); }
#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
unsigned long long pos = in.tellg();
if ((pos == -1) || (pos >= filePos->end)) { break; }
}
//***************************************************************************************************************
+/**
+ * Copies the taxonomy file to "<root>good<ext>" in outputDir, leaving out every
+ * record whose sequence name appears in badSeqNames.
+ *
+ * Each record is read as two whitespace-delimited tokens: a sequence name
+ * followed by its taxonomy string.
+ *
+ * @param badSeqNames names of the sequences to drop. Taken by value: names are
+ *        erased from this local copy as they are matched, so whatever remains
+ *        after the pass was never found in the taxonomy file and is reported
+ *        to the user. The caller's set is not modified.
+ * @return always 0. If m->control_pressed is set, the partially written output
+ *         file is removed before returning.
+ *
+ * The output file name is registered in outputNames and outputTypes["taxonomy"].
+ * Any std::exception is reported through m->errorOut and the process exits.
+ */
+int ScreenSeqsCommand::screenTaxonomy(set<string> badSeqNames){
+	try {
+		ifstream input;
+		m->openInputFile(taxonomy, input);
+		string seqName, tax;
+		set<string>::iterator it;
+
+		string goodTaxFile = outputDir + m->getRootName(m->getSimpleName(taxonomy)) + "good" + m->getExtension(taxonomy);
+		outputNames.push_back(goodTaxFile);  outputTypes["taxonomy"].push_back(goodTaxFile);
+		ofstream goodTaxOut;	m->openOutputFile(goodTaxFile, goodTaxOut);
+
+		while(!input.eof()){
+			if (m->control_pressed) { goodTaxOut.close(); input.close(); m->mothurRemove(goodTaxFile); return 0; }
+
+			//eof() only turns true after a read has hit the end, so the extraction itself must be checked:
+			//an empty file, or a trailing name with no taxonomy, would otherwise be written out as a blank or stale record
+			if (!(input >> seqName >> tax)) { break; }
+
+			it = badSeqNames.find(seqName);
+
+			//a matched bad name is dropped from the output and ticked off the local list
+			if(it != badSeqNames.end()){ badSeqNames.erase(it); }
+			else{
+				goodTaxOut << seqName << '\t' << tax << endl;
+			}
+			m->gobble(input);
+		}
+
+		if (m->control_pressed) { goodTaxOut.close(); input.close(); m->mothurRemove(goodTaxFile); return 0; }
+
+		//we were unable to remove some of the bad sequences
+		if (badSeqNames.size() != 0) {
+			for (it = badSeqNames.begin(); it != badSeqNames.end(); it++) {
+				m->mothurOut("Your taxonomy file does not include the sequence " + *it + " please correct.");
+				m->mothurOutEndLine();
+			}
+		}
+
+		input.close();
+		goodTaxOut.close();
+
+		if (m->control_pressed) { m->mothurRemove(goodTaxFile); return 0; }
+
+		return 0;
+
+	}
+	catch(exception& e) {
+		m->errorOut(e, "ScreenSeqsCommand", "screenTaxonomy");
+		exit(1);
+	}
+
+}
+//***************************************************************************************************************
+
int ScreenSeqsCommand::screenQual(set<string> badSeqNames){
try {
ifstream in;