2 * screenseqscommand.cpp
5 * Created by Pat Schloss on 6/3/09.
6 * Copyright 2009 Patrick D. Schloss. All rights reserved.
10 #include "screenseqscommand.h"
11 #include "counttable.h"
13 //**********************************************************************************************************************
14 vector<string> ScreenSeqsCommand::setParameters(){
16 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta);
17 CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "none",false,false); parameters.push_back(pname);
18 CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "none", "none",false,false); parameters.push_back(pcount);
19 CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "none", "none",false,false); parameters.push_back(pgroup);
20 CommandParameter pqfile("qfile", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pqfile);
21 CommandParameter palignreport("alignreport", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(palignreport);
22 CommandParameter ptax("taxonomy", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(ptax);
23 CommandParameter pstart("start", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pstart);
24 CommandParameter pend("end", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pend);
25 CommandParameter pmaxambig("maxambig", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pmaxambig);
26 CommandParameter pmaxhomop("maxhomop", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pmaxhomop);
27 CommandParameter pminlength("minlength", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pminlength);
28 CommandParameter pmaxlength("maxlength", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pmaxlength);
29 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
30 CommandParameter pcriteria("criteria", "Number", "", "90", "", "", "",false,false); parameters.push_back(pcriteria);
31 CommandParameter poptimize("optimize", "Multiple", "none-start-end-maxambig-maxhomop-minlength-maxlength", "none", "", "", "",true,false); parameters.push_back(poptimize);
32 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
33 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
35 vector<string> myArray;
36 for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); }
40 m->errorOut(e, "ScreenSeqsCommand", "setParameters");
44 //**********************************************************************************************************************
45 string ScreenSeqsCommand::getHelpString(){
47 string helpString = "";
48 helpString += "The screen.seqs command reads a fastafile and screens sequences.\n";
49 helpString += "The screen.seqs command parameters are fasta, start, end, maxambig, maxhomop, minlength, maxlength, name, group, count, qfile, alignreport, taxonomy, optimize, criteria and processors.\n";
50 helpString += "The fasta parameter is required.\n";
51 helpString += "The alignreport and taxonomy parameters allow you to remove bad seqs from taxonomy and alignreport files.\n";
52 helpString += "The start parameter is used to set a position the \"good\" sequences must start by. The default is -1.\n";
53 helpString += "The end parameter is used to set a position the \"good\" sequences must end after. The default is -1.\n";
54 helpString += "The maxambig parameter allows you to set the maximum number of ambigious bases allowed. The default is -1.\n";
55 helpString += "The maxhomop parameter allows you to set a maximum homopolymer length. \n";
56 helpString += "The minlength parameter allows you to set and minimum sequence length. \n";
57 helpString += "The maxlength parameter allows you to set and maximum sequence length. \n";
58 helpString += "The processors parameter allows you to specify the number of processors to use while running the command. The default is 1.\n";
59 helpString += "The optimize and criteria parameters allow you set the start, end, maxabig, maxhomop, minlength and maxlength parameters relative to your set of sequences .\n";
60 helpString += "For example optimize=start-end, criteria=90, would set the start and end values to the position 90% of your sequences started and ended.\n";
61 helpString += "The name parameter allows you to provide a namesfile, and the group parameter allows you to provide a groupfile.\n";
62 helpString += "The screen.seqs command should be in the following format: \n";
63 helpString += "screen.seqs(fasta=yourFastaFile, name=youNameFile, group=yourGroupFIle, start=yourStart, end=yourEnd, maxambig=yourMaxambig, \n";
64 helpString += "maxhomop=yourMaxhomop, minlength=youMinlength, maxlength=yourMaxlength) \n";
65 helpString += "Example screen.seqs(fasta=abrecovery.fasta, name=abrecovery.names, group=abrecovery.groups, start=..., end=..., maxambig=..., maxhomop=..., minlength=..., maxlength=...).\n";
66 helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n";
70 m->errorOut(e, "ScreenSeqsCommand", "getHelpString");
74 //**********************************************************************************************************************
75 string ScreenSeqsCommand::getOutputFileNameTag(string type, string inputName=""){
77 string outputFileName = "";
78 map<string, vector<string> >::iterator it;
80 //is this a type this command creates
81 it = outputTypes.find(type);
82 if (it == outputTypes.end()) { m->mothurOut("[ERROR]: this command doesn't create a " + type + " output file.\n"); }
84 if (type == "fasta") { outputFileName = "good" + m->getExtension(inputName); }
85 else if (type == "taxonomy") { outputFileName = "good" + m->getExtension(inputName); }
86 else if (type == "name") { outputFileName = "good" + m->getExtension(inputName); }
87 else if (type == "count") { outputFileName = "good" + m->getExtension(inputName); }
88 else if (type == "group") { outputFileName = "good" + m->getExtension(inputName); }
89 else if (type == "accnos") { outputFileName = "bad.accnos"; }
90 else if (type == "qfile") { outputFileName = "good" + m->getExtension(inputName); }
91 else if (type == "alignreport") { outputFileName = "good.align.report"; }
92 else { m->mothurOut("[ERROR]: No definition for type " + type + " output file tag.\n"); m->control_pressed = true; }
94 return outputFileName;
97 m->errorOut(e, "ScreenSeqsCommand", "getOutputFileNameTag");
102 //**********************************************************************************************************************
103 ScreenSeqsCommand::ScreenSeqsCommand(){
105 abort = true; calledHelp = true;
107 vector<string> tempOutNames;
108 outputTypes["fasta"] = tempOutNames;
109 outputTypes["name"] = tempOutNames;
110 outputTypes["group"] = tempOutNames;
111 outputTypes["alignreport"] = tempOutNames;
112 outputTypes["accnos"] = tempOutNames;
113 outputTypes["qfile"] = tempOutNames;
114 outputTypes["taxonomy"] = tempOutNames;
115 outputTypes["count"] = tempOutNames;
117 catch(exception& e) {
118 m->errorOut(e, "ScreenSeqsCommand", "ScreenSeqsCommand");
122 //***************************************************************************************************************
124 ScreenSeqsCommand::ScreenSeqsCommand(string option) {
126 abort = false; calledHelp = false;
128 //allow user to run help
129 if(option == "help") { help(); abort = true; calledHelp = true; }
130 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
133 vector<string> myArray = setParameters();
135 OptionParser parser(option);
136 map<string,string> parameters = parser.getParameters();
138 ValidParameters validParameter("screen.seqs");
139 map<string,string>::iterator it;
141 //check to make sure all parameters are valid for command
142 for (it = parameters.begin(); it != parameters.end(); it++) {
143 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
146 //initialize outputTypes
147 vector<string> tempOutNames;
148 outputTypes["fasta"] = tempOutNames;
149 outputTypes["name"] = tempOutNames;
150 outputTypes["group"] = tempOutNames;
151 outputTypes["alignreport"] = tempOutNames;
152 outputTypes["accnos"] = tempOutNames;
153 outputTypes["qfile"] = tempOutNames;
154 outputTypes["taxonomy"] = tempOutNames;
155 outputTypes["count"] = tempOutNames;
157 //if the user changes the input directory command factory will send this info to us in the output parameter
158 string inputDir = validParameter.validFile(parameters, "inputdir", false);
159 if (inputDir == "not found"){ inputDir = ""; }
162 it = parameters.find("fasta");
163 //user has given a template file
164 if(it != parameters.end()){
165 path = m->hasPath(it->second);
166 //if the user has not given a path then, add inputdir. else leave path alone.
167 if (path == "") { parameters["fasta"] = inputDir + it->second; }
170 it = parameters.find("group");
171 //user has given a template file
172 if(it != parameters.end()){
173 path = m->hasPath(it->second);
174 //if the user has not given a path then, add inputdir. else leave path alone.
175 if (path == "") { parameters["group"] = inputDir + it->second; }
178 it = parameters.find("name");
179 //user has given a template file
180 if(it != parameters.end()){
181 path = m->hasPath(it->second);
182 //if the user has not given a path then, add inputdir. else leave path alone.
183 if (path == "") { parameters["name"] = inputDir + it->second; }
186 it = parameters.find("alignreport");
187 //user has given a template file
188 if(it != parameters.end()){
189 path = m->hasPath(it->second);
190 //if the user has not given a path then, add inputdir. else leave path alone.
191 if (path == "") { parameters["alignreport"] = inputDir + it->second; }
194 it = parameters.find("qfile");
195 //user has given a template file
196 if(it != parameters.end()){
197 path = m->hasPath(it->second);
198 //if the user has not given a path then, add inputdir. else leave path alone.
199 if (path == "") { parameters["qfile"] = inputDir + it->second; }
202 it = parameters.find("taxonomy");
203 //user has given a template file
204 if(it != parameters.end()){
205 path = m->hasPath(it->second);
206 //if the user has not given a path then, add inputdir. else leave path alone.
207 if (path == "") { parameters["taxonomy"] = inputDir + it->second; }
210 it = parameters.find("count");
211 //user has given a template file
212 if(it != parameters.end()){
213 path = m->hasPath(it->second);
214 //if the user has not given a path then, add inputdir. else leave path alone.
215 if (path == "") { parameters["count"] = inputDir + it->second; }
219 //check for required parameters
220 fastafile = validParameter.validFile(parameters, "fasta", true);
221 if (fastafile == "not found") {
222 fastafile = m->getFastaFile();
223 if (fastafile != "") { m->mothurOut("Using " + fastafile + " as input file for the fasta parameter."); m->mothurOutEndLine(); }
224 else { m->mothurOut("You have no current fastafile and the fasta parameter is required."); m->mothurOutEndLine(); abort = true; }
226 else if (fastafile == "not open") { abort = true; }
227 else { m->setFastaFile(fastafile); }
229 groupfile = validParameter.validFile(parameters, "group", true);
230 if (groupfile == "not open") { abort = true; }
231 else if (groupfile == "not found") { groupfile = ""; }
232 else { m->setGroupFile(groupfile); }
234 qualfile = validParameter.validFile(parameters, "qfile", true);
235 if (qualfile == "not open") { abort = true; }
236 else if (qualfile == "not found") { qualfile = ""; }
237 else { m->setQualFile(qualfile); }
239 namefile = validParameter.validFile(parameters, "name", true);
240 if (namefile == "not open") { namefile = ""; abort = true; }
241 else if (namefile == "not found") { namefile = ""; }
242 else { m->setNameFile(namefile); }
244 countfile = validParameter.validFile(parameters, "count", true);
245 if (countfile == "not open") { countfile = ""; abort = true; }
246 else if (countfile == "not found") { countfile = ""; }
247 else { m->setCountTableFile(countfile); }
249 if ((namefile != "") && (countfile != "")) {
250 m->mothurOut("[ERROR]: you may only use one of the following: name or count."); m->mothurOutEndLine(); abort = true;
253 if ((groupfile != "") && (countfile != "")) {
254 m->mothurOut("[ERROR]: you may only use one of the following: group or count."); m->mothurOutEndLine(); abort=true;
257 alignreport = validParameter.validFile(parameters, "alignreport", true);
258 if (alignreport == "not open") { abort = true; }
259 else if (alignreport == "not found") { alignreport = ""; }
261 taxonomy = validParameter.validFile(parameters, "taxonomy", true);
262 if (taxonomy == "not open") { abort = true; }
263 else if (taxonomy == "not found") { taxonomy = ""; }
265 //if the user changes the output directory command factory will send this info to us in the output parameter
266 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){
268 outputDir += m->hasPath(fastafile); //if user entered a file with a path then preserve it
271 //check for optional parameter and set defaults
272 // ...at some point should added some additional type checking...
274 temp = validParameter.validFile(parameters, "start", false); if (temp == "not found") { temp = "-1"; }
275 m->mothurConvert(temp, startPos);
277 temp = validParameter.validFile(parameters, "end", false); if (temp == "not found") { temp = "-1"; }
278 m->mothurConvert(temp, endPos);
280 temp = validParameter.validFile(parameters, "maxambig", false); if (temp == "not found") { temp = "-1"; }
281 m->mothurConvert(temp, maxAmbig);
283 temp = validParameter.validFile(parameters, "maxhomop", false); if (temp == "not found") { temp = "-1"; }
284 m->mothurConvert(temp, maxHomoP);
286 temp = validParameter.validFile(parameters, "minlength", false); if (temp == "not found") { temp = "-1"; }
287 m->mothurConvert(temp, minLength);
289 temp = validParameter.validFile(parameters, "maxlength", false); if (temp == "not found") { temp = "-1"; }
290 m->mothurConvert(temp, maxLength);
292 temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); }
293 m->setProcessors(temp);
294 m->mothurConvert(temp, processors);
296 temp = validParameter.validFile(parameters, "optimize", false); //optimizing trumps the optimized values original value
297 if (temp == "not found"){ temp = "none"; }
298 m->splitAtDash(temp, optimize);
300 //check for invalid optimize options
301 set<string> validOptimizers;
302 validOptimizers.insert("none"); validOptimizers.insert("start"); validOptimizers.insert("end"); validOptimizers.insert("maxambig"); validOptimizers.insert("maxhomop"); validOptimizers.insert("minlength"); validOptimizers.insert("maxlength");
303 for (int i = 0; i < optimize.size(); i++) {
304 if (validOptimizers.count(optimize[i]) == 0) {
305 m->mothurOut(optimize[i] + " is not a valid optimizer. Valid options are start, end, maxambig, maxhomop, minlength and maxlength."); m->mothurOutEndLine();
306 optimize.erase(optimize.begin()+i);
311 if (optimize.size() == 1) { if (optimize[0] == "none") { optimize.clear(); } }
313 temp = validParameter.validFile(parameters, "criteria", false); if (temp == "not found"){ temp = "90"; }
314 m->mothurConvert(temp, criteria);
316 if (countfile == "") {
317 if (namefile == "") {
318 vector<string> files; files.push_back(fastafile);
319 parser.getNameFile(files);
325 catch(exception& e) {
326 m->errorOut(e, "ScreenSeqsCommand", "ScreenSeqsCommand");
330 //***************************************************************************************************************
332 int ScreenSeqsCommand::execute(){
335 if (abort == true) { if (calledHelp) { return 0; } return 2; }
337 //if the user want to optimize we need to know the 90% mark
338 vector<unsigned long long> positions;
339 if (optimize.size() != 0) { //get summary is paralellized so we need to divideFile, no need to do this step twice so I moved it here
340 //use the namefile to optimize correctly
341 if (namefile != "") { nameMap = m->readNames(namefile); }
342 else if (countfile != "") {
344 ct.readTable(countfile);
345 nameMap = ct.getNameMap();
347 getSummary(positions);
350 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
351 positions = m->divideFile(fastafile, processors);
352 for (int i = 0; i < (positions.size()-1); i++) { lines.push_back(linePair(positions[i], positions[(i+1)])); }
354 if(processors == 1){ lines.push_back(linePair(0, 1000)); }
356 int numFastaSeqs = 0;
357 positions = m->setFilePosFasta(fastafile, numFastaSeqs);
358 if (positions.size() < processors) { processors = positions.size(); }
360 //figure out how many sequences you have to process
361 int numSeqsPerProcessor = numFastaSeqs / processors;
362 for (int i = 0; i < processors; i++) {
363 int startIndex = i * numSeqsPerProcessor;
364 if(i == (processors - 1)){ numSeqsPerProcessor = numFastaSeqs - i * numSeqsPerProcessor; }
365 lines.push_back(linePair(positions[startIndex], numSeqsPerProcessor));
371 string goodSeqFile = outputDir + m->getRootName(m->getSimpleName(fastafile)) + getOutputFileNameTag("fasta", fastafile);
372 string badAccnosFile = outputDir + m->getRootName(m->getSimpleName(fastafile)) + getOutputFileNameTag("accnos");
374 int numFastaSeqs = 0;
375 set<string> badSeqNames;
376 int start = time(NULL);
379 int pid, numSeqsPerProcessor;
381 vector<unsigned long long> MPIPos;
384 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
385 MPI_Comm_size(MPI_COMM_WORLD, &processors);
389 MPI_File outMPIBadAccnos;
391 int outMode=MPI_MODE_CREATE|MPI_MODE_WRONLY;
392 int inMode=MPI_MODE_RDONLY;
394 char outGoodFilename[1024];
395 strcpy(outGoodFilename, goodSeqFile.c_str());
397 char outBadAccnosFilename[1024];
398 strcpy(outBadAccnosFilename, badAccnosFile.c_str());
400 char inFileName[1024];
401 strcpy(inFileName, fastafile.c_str());
403 MPI_File_open(MPI_COMM_WORLD, inFileName, inMode, MPI_INFO_NULL, &inMPI); //comm, filename, mode, info, filepointer
404 MPI_File_open(MPI_COMM_WORLD, outGoodFilename, outMode, MPI_INFO_NULL, &outMPIGood);
405 MPI_File_open(MPI_COMM_WORLD, outBadAccnosFilename, outMode, MPI_INFO_NULL, &outMPIBadAccnos);
407 if (m->control_pressed) { MPI_File_close(&inMPI); MPI_File_close(&outMPIGood); MPI_File_close(&outMPIBadAccnos); return 0; }
409 if (pid == 0) { //you are the root process
411 MPIPos = m->setFilePosFasta(fastafile, numFastaSeqs); //fills MPIPos, returns numSeqs
413 //send file positions to all processes
414 for(int i = 1; i < processors; i++) {
415 MPI_Send(&numFastaSeqs, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
416 MPI_Send(&MPIPos[0], (numFastaSeqs+1), MPI_LONG, i, tag, MPI_COMM_WORLD);
419 //figure out how many sequences you have to align
420 numSeqsPerProcessor = numFastaSeqs / processors;
421 int startIndex = pid * numSeqsPerProcessor;
422 if(pid == (processors - 1)){ numSeqsPerProcessor = numFastaSeqs - pid * numSeqsPerProcessor; }
425 driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPIGood, outMPIBadAccnos, MPIPos, badSeqNames);
427 if (m->control_pressed) { MPI_File_close(&inMPI); MPI_File_close(&outMPIGood); MPI_File_close(&outMPIBadAccnos); return 0; }
429 for (int i = 1; i < processors; i++) {
432 MPI_Recv(&badSize, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
434 }else{ //you are a child process
435 MPI_Recv(&numFastaSeqs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
436 MPIPos.resize(numFastaSeqs+1);
437 MPI_Recv(&MPIPos[0], (numFastaSeqs+1), MPI_LONG, 0, tag, MPI_COMM_WORLD, &status);
439 //figure out how many sequences you have to align
440 numSeqsPerProcessor = numFastaSeqs / processors;
441 int startIndex = pid * numSeqsPerProcessor;
442 if(pid == (processors - 1)){ numSeqsPerProcessor = numFastaSeqs - pid * numSeqsPerProcessor; }
445 driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPIGood, outMPIBadAccnos, MPIPos, badSeqNames);
447 if (m->control_pressed) { MPI_File_close(&inMPI); MPI_File_close(&outMPIGood); MPI_File_close(&outMPIBadAccnos); return 0; }
450 int badSize = badSeqNames.size();
451 MPI_Send(&badSize, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
455 MPI_File_close(&inMPI);
456 MPI_File_close(&outMPIGood);
457 MPI_File_close(&outMPIBadAccnos);
458 MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case
461 if(processors == 1){ numFastaSeqs = driver(lines[0], goodSeqFile, badAccnosFile, fastafile, badSeqNames); }
462 else{ numFastaSeqs = createProcesses(goodSeqFile, badAccnosFile, fastafile, badSeqNames); }
464 if (m->control_pressed) { m->mothurRemove(goodSeqFile); return 0; }
468 MPI_Comm_rank(MPI_COMM_WORLD, &pid);
470 if (pid == 0) { //only one process should fix files
472 //read accnos file with all names in it, process 0 just has its names
473 MPI_File inMPIAccnos;
476 char inFileName[1024];
477 strcpy(inFileName, badAccnosFile.c_str());
479 MPI_File_open(MPI_COMM_SELF, inFileName, inMode, MPI_INFO_NULL, &inMPIAccnos); //comm, filename, mode, info, filepointer
480 MPI_File_get_size(inMPIAccnos, &size);
482 char* buffer = new char[size];
483 MPI_File_read(inMPIAccnos, buffer, size, MPI_CHAR, &status);
485 string tempBuf = buffer;
486 if (tempBuf.length() > size) { tempBuf = tempBuf.substr(0, size); }
487 istringstream iss (tempBuf,istringstream::in);
490 MPI_File_close(&inMPIAccnos);
495 iss >> tempName; m->gobble(iss);
496 badSeqNames.insert(tempName);
500 if(namefile != "" && groupfile != "") {
501 screenNameGroupFile(badSeqNames);
502 if (m->control_pressed) { m->mothurRemove(goodSeqFile); return 0; }
503 }else if(namefile != "") {
504 screenNameGroupFile(badSeqNames);
505 if (m->control_pressed) { m->mothurRemove(goodSeqFile); return 0; }
506 }else if(groupfile != "") { screenGroupFile(badSeqNames); } // this screens just the group
507 else if (countfile != "") { screenCountFile(badSeqNames); }
510 if (m->control_pressed) { m->mothurRemove(goodSeqFile); return 0; }
512 if(alignreport != "") { screenAlignReport(badSeqNames); }
513 if(qualfile != "") { screenQual(badSeqNames); }
514 if(taxonomy != "") { screenTaxonomy(badSeqNames); }
516 if (m->control_pressed) { m->mothurRemove(goodSeqFile); return 0; }
522 m->mothurOutEndLine();
523 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
524 m->mothurOut(goodSeqFile); m->mothurOutEndLine(); outputTypes["fasta"].push_back(goodSeqFile);
525 m->mothurOut(badAccnosFile); m->mothurOutEndLine(); outputTypes["accnos"].push_back(badAccnosFile);
526 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
527 m->mothurOutEndLine();
528 m->mothurOutEndLine();
530 //set fasta file as new current fastafile
532 itTypes = outputTypes.find("fasta");
533 if (itTypes != outputTypes.end()) {
534 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFastaFile(current); }
537 itTypes = outputTypes.find("name");
538 if (itTypes != outputTypes.end()) {
539 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setNameFile(current); }
542 itTypes = outputTypes.find("group");
543 if (itTypes != outputTypes.end()) {
544 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setGroupFile(current); }
547 itTypes = outputTypes.find("qfile");
548 if (itTypes != outputTypes.end()) {
549 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setQualFile(current); }
552 itTypes = outputTypes.find("taxonomy");
553 if (itTypes != outputTypes.end()) {
554 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setTaxonomyFile(current); }
557 itTypes = outputTypes.find("count");
558 if (itTypes != outputTypes.end()) {
559 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setCountTableFile(current); }
562 m->mothurOut("It took " + toString(time(NULL) - start) + " secs to screen " + toString(numFastaSeqs) + " sequences.");
563 m->mothurOutEndLine();
567 catch(exception& e) {
568 m->errorOut(e, "ScreenSeqsCommand", "execute");
573 //***************************************************************************************************************
575 int ScreenSeqsCommand::screenNameGroupFile(set<string> badSeqNames){
578 m->openInputFile(namefile, inputNames);
579 set<string> badSeqGroups;
580 string seqName, seqList, group;
581 set<string>::iterator it;
583 string goodNameFile = outputDir + m->getRootName(m->getSimpleName(namefile)) + getOutputFileNameTag("name", namefile);
584 outputNames.push_back(goodNameFile); outputTypes["name"].push_back(goodNameFile);
586 ofstream goodNameOut; m->openOutputFile(goodNameFile, goodNameOut);
588 while(!inputNames.eof()){
589 if (m->control_pressed) { goodNameOut.close(); inputNames.close(); m->mothurRemove(goodNameFile); return 0; }
591 inputNames >> seqName >> seqList;
592 it = badSeqNames.find(seqName);
594 if(it != badSeqNames.end()){
595 badSeqNames.erase(it);
599 for(int i=0;i<seqList.length();i++){
600 if(seqList[i] == ','){
601 badSeqGroups.insert(seqList.substr(start,i-start));
605 badSeqGroups.insert(seqList.substr(start,seqList.length()-start));
609 goodNameOut << seqName << '\t' << seqList << endl;
611 m->gobble(inputNames);
616 //we were unable to remove some of the bad sequences
617 if (badSeqNames.size() != 0) {
618 for (it = badSeqNames.begin(); it != badSeqNames.end(); it++) {
619 m->mothurOut("Your namefile does not include the sequence " + *it + " please correct.");
620 m->mothurOutEndLine();
626 ifstream inputGroups;
627 m->openInputFile(groupfile, inputGroups);
629 string goodGroupFile = outputDir + m->getRootName(m->getSimpleName(groupfile)) + getOutputFileNameTag("group", groupfile);
630 outputNames.push_back(goodGroupFile); outputTypes["group"].push_back(goodGroupFile);
632 ofstream goodGroupOut; m->openOutputFile(goodGroupFile, goodGroupOut);
634 while(!inputGroups.eof()){
635 if (m->control_pressed) { goodGroupOut.close(); inputGroups.close(); m->mothurRemove(goodNameFile); m->mothurRemove(goodGroupFile); return 0; }
637 inputGroups >> seqName >> group;
639 it = badSeqGroups.find(seqName);
641 if(it != badSeqGroups.end()){
642 badSeqGroups.erase(it);
645 goodGroupOut << seqName << '\t' << group << endl;
647 m->gobble(inputGroups);
650 goodGroupOut.close();
652 //we were unable to remove some of the bad sequences
653 if (badSeqGroups.size() != 0) {
654 for (it = badSeqGroups.begin(); it != badSeqGroups.end(); it++) {
655 m->mothurOut("Your groupfile does not include the sequence " + *it + " please correct.");
656 m->mothurOutEndLine();
665 catch(exception& e) {
666 m->errorOut(e, "ScreenSeqsCommand", "screenNameGroupFile");
670 //***************************************************************************************************************
671 int ScreenSeqsCommand::getSummary(vector<unsigned long long>& positions){
674 vector<int> startPosition;
675 vector<int> endPosition;
676 vector<int> seqLength;
677 vector<int> ambigBases;
678 vector<int> longHomoPolymer;
680 vector<unsigned long long> positions;
681 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
682 positions = m->divideFile(fastafile, processors);
683 for (int i = 0; i < (positions.size()-1); i++) { lines.push_back(linePair(positions[i], positions[(i+1)])); }
685 if(processors == 1){ lines.push_back(linePair(0, 1000)); }
687 int numFastaSeqs = 0;
688 positions = m->setFilePosFasta(fastafile, numFastaSeqs);
689 if (positions.size() < processors) { processors = positions.size(); }
691 //figure out how many sequences you have to process
692 int numSeqsPerProcessor = numFastaSeqs / processors;
693 for (int i = 0; i < processors; i++) {
694 int startIndex = i * numSeqsPerProcessor;
695 if(i == (processors - 1)){ numSeqsPerProcessor = numFastaSeqs - i * numSeqsPerProcessor; }
696 lines.push_back(linePair(positions[startIndex], numSeqsPerProcessor));
703 MPI_Comm_rank(MPI_COMM_WORLD, &pid);
706 driverCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, fastafile, lines[0]);
709 //#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
711 numSeqs = driverCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, fastafile, lines[0]);
713 numSeqs = createProcessesCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, fastafile);
716 if (m->control_pressed) { return 0; }
718 // numSeqs = driverCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, fastafile, lines[0]);
719 // if (m->control_pressed) { return 0; }
722 sort(startPosition.begin(), startPosition.end());
723 sort(endPosition.begin(), endPosition.end());
724 sort(seqLength.begin(), seqLength.end());
725 sort(ambigBases.begin(), ambigBases.end());
726 sort(longHomoPolymer.begin(), longHomoPolymer.end());
728 //numSeqs is the number of unique seqs, startPosition.size() is the total number of seqs, we want to optimize using all seqs
729 int criteriaPercentile = int(startPosition.size() * (criteria / (float) 100));
731 for (int i = 0; i < optimize.size(); i++) {
732 if (optimize[i] == "start") { startPos = startPosition[criteriaPercentile]; m->mothurOut("Optimizing start to " + toString(startPos) + "."); m->mothurOutEndLine(); }
733 else if (optimize[i] == "end") { int endcriteriaPercentile = int(endPosition.size() * ((100 - criteria) / (float) 100)); endPos = endPosition[endcriteriaPercentile]; m->mothurOut("Optimizing end to " + toString(endPos) + "."); m->mothurOutEndLine();}
734 else if (optimize[i] == "maxambig") { maxAmbig = ambigBases[criteriaPercentile]; m->mothurOut("Optimizing maxambig to " + toString(maxAmbig) + "."); m->mothurOutEndLine(); }
735 else if (optimize[i] == "maxhomop") { maxHomoP = longHomoPolymer[criteriaPercentile]; m->mothurOut("Optimizing maxhomop to " + toString(maxHomoP) + "."); m->mothurOutEndLine(); }
736 else if (optimize[i] == "minlength") { int mincriteriaPercentile = int(seqLength.size() * ((100 - criteria) / (float) 100)); minLength = seqLength[mincriteriaPercentile]; m->mothurOut("Optimizing minlength to " + toString(minLength) + "."); m->mothurOutEndLine(); }
737 else if (optimize[i] == "maxlength") { maxLength = seqLength[criteriaPercentile]; m->mothurOut("Optimizing maxlength to " + toString(maxLength) + "."); m->mothurOutEndLine(); }
744 MPI_Comm_rank(MPI_COMM_WORLD, &pid);
745 MPI_Comm_size(MPI_COMM_WORLD, &processors);
748 //send file positions to all processes
749 for(int i = 1; i < processors; i++) {
750 MPI_Send(&startPos, 1, MPI_INT, i, 2001, MPI_COMM_WORLD);
751 MPI_Send(&endPos, 1, MPI_INT, i, 2001, MPI_COMM_WORLD);
752 MPI_Send(&maxAmbig, 1, MPI_INT, i, 2001, MPI_COMM_WORLD);
753 MPI_Send(&maxHomoP, 1, MPI_INT, i, 2001, MPI_COMM_WORLD);
754 MPI_Send(&minLength, 1, MPI_INT, i, 2001, MPI_COMM_WORLD);
755 MPI_Send(&maxLength, 1, MPI_INT, i, 2001, MPI_COMM_WORLD);
758 MPI_Recv(&startPos, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status);
759 MPI_Recv(&endPos, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status);
760 MPI_Recv(&maxAmbig, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status);
761 MPI_Recv(&maxHomoP, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status);
762 MPI_Recv(&minLength, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status);
763 MPI_Recv(&maxLength, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status);
765 MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case
769 catch(exception& e) {
770 m->errorOut(e, "ScreenSeqsCommand", "getSummary");
774 /**************************************************************************************/
775 int ScreenSeqsCommand::driverCreateSummary(vector<int>& startPosition, vector<int>& endPosition, vector<int>& seqLength, vector<int>& ambigBases, vector<int>& longHomoPolymer, string filename, linePair filePos) {
779 m->openInputFile(filename, in);
781 in.seekg(filePos.start);
788 if (m->control_pressed) { in.close(); return 1; }
790 Sequence current(in); m->gobble(in);
792 if (current.getName() != "") {
794 if (namefile != "") {
795 //make sure this sequence is in the namefile, else error
796 map<string, int>::iterator it = nameMap.find(current.getName());
798 if (it == nameMap.end()) { m->mothurOut("[ERROR]: " + current.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); m->control_pressed = true; }
799 else { num = it->second; }
802 //for each sequence this sequence represents
803 for (int i = 0; i < num; i++) {
804 startPosition.push_back(current.getStartPos());
805 endPosition.push_back(current.getEndPos());
806 seqLength.push_back(current.getNumBases());
807 ambigBases.push_back(current.getAmbigBases());
808 longHomoPolymer.push_back(current.getLongHomoPolymer());
813 //if((count) % 100 == 0){ m->mothurOut("Optimizing sequence: " + toString(count)); m->mothurOutEndLine(); }
814 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
815 unsigned long long pos = in.tellg();
816 if ((pos == -1) || (pos >= filePos.end)) { break; }
818 if (in.eof()) { break; }
827 catch(exception& e) {
828 m->errorOut(e, "ScreenSeqsCommand", "driverCreateSummary");
832 /**************************************************************************************************/
833 int ScreenSeqsCommand::createProcessesCreateSummary(vector<int>& startPosition, vector<int>& endPosition, vector<int>& seqLength, vector<int>& ambigBases, vector<int>& longHomoPolymer, string filename) {
838 vector<int> processIDS;
840 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
842 //loop through and create all the processes you want
843 while (process != processors) {
847 processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later
850 num = driverCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, fastafile, lines[process]);
852 //pass numSeqs to parent
854 string tempFile = fastafile + toString(getpid()) + ".num.temp";
855 m->openOutputFile(tempFile, out);
858 out << startPosition.size() << endl;
859 for (int k = 0; k < startPosition.size(); k++) { out << startPosition[k] << '\t'; } out << endl;
860 for (int k = 0; k < endPosition.size(); k++) { out << endPosition[k] << '\t'; } out << endl;
861 for (int k = 0; k < seqLength.size(); k++) { out << seqLength[k] << '\t'; } out << endl;
862 for (int k = 0; k < ambigBases.size(); k++) { out << ambigBases[k] << '\t'; } out << endl;
863 for (int k = 0; k < longHomoPolymer.size(); k++) { out << longHomoPolymer[k] << '\t'; } out << endl;
869 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine();
870 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
875 num = driverCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, fastafile, lines[0]);
877 //force parent to wait until all the processes are done
878 for (int i=0;i<processIDS.size();i++) {
879 int temp = processIDS[i];
883 //parent reads in and combine Filter info
884 for (int i = 0; i < processIDS.size(); i++) {
885 string tempFilename = fastafile + toString(processIDS[i]) + ".num.temp";
887 m->openInputFile(tempFilename, in);
890 in >> tempNum; m->gobble(in); num += tempNum;
891 in >> tempNum; m->gobble(in);
892 for (int k = 0; k < tempNum; k++) { in >> temp; startPosition.push_back(temp); } m->gobble(in);
893 for (int k = 0; k < tempNum; k++) { in >> temp; endPosition.push_back(temp); } m->gobble(in);
894 for (int k = 0; k < tempNum; k++) { in >> temp; seqLength.push_back(temp); } m->gobble(in);
895 for (int k = 0; k < tempNum; k++) { in >> temp; ambigBases.push_back(temp); } m->gobble(in);
896 for (int k = 0; k < tempNum; k++) { in >> temp; longHomoPolymer.push_back(temp); } m->gobble(in);
899 m->mothurRemove(tempFilename);
904 //////////////////////////////////////////////////////////////////////////////////////////////////////
905 //Windows version shared memory, so be careful when passing variables through the seqSumData struct.
906 //Above fork() will clone, so memory is separate, but that's not the case with windows,
907 //Taking advantage of shared memory to allow both threads to add info to vectors.
908 //////////////////////////////////////////////////////////////////////////////////////////////////////
910 vector<sumData*> pDataArray;
911 DWORD dwThreadIdArray[processors-1];
912 HANDLE hThreadArray[processors-1];
914 //Create processor worker threads.
915 for( int i=0; i<processors-1; i++ ){
917 // Allocate memory for thread data.
918 sumData* tempSum = new sumData(filename, m, lines[i].start, lines[i].end, namefile, nameMap);
919 pDataArray.push_back(tempSum);
921 //MySeqSumThreadFunction is in header. It must be global or static to work with the threads.
922 //default security attributes, thread function name, argument to thread function, use default creation flags, returns the thread identifier
923 hThreadArray[i] = CreateThread(NULL, 0, MySumThreadFunction, pDataArray[i], 0, &dwThreadIdArray[i]);
927 num = driverCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, fastafile, lines[processors-1]);
929 //Wait until all threads have terminated.
930 WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
932 //Close all thread handles and free memory allocations.
933 for(int i=0; i < pDataArray.size(); i++){
934 num += pDataArray[i]->count;
935 for (int k = 0; k < pDataArray[i]->startPosition.size(); k++) { startPosition.push_back(pDataArray[i]->startPosition[k]); }
936 for (int k = 0; k < pDataArray[i]->endPosition.size(); k++) { endPosition.push_back(pDataArray[i]->endPosition[k]); }
937 for (int k = 0; k < pDataArray[i]->seqLength.size(); k++) { seqLength.push_back(pDataArray[i]->seqLength[k]); }
938 for (int k = 0; k < pDataArray[i]->ambigBases.size(); k++) { ambigBases.push_back(pDataArray[i]->ambigBases[k]); }
939 for (int k = 0; k < pDataArray[i]->longHomoPolymer.size(); k++) { longHomoPolymer.push_back(pDataArray[i]->longHomoPolymer[k]); }
940 CloseHandle(hThreadArray[i]);
941 delete pDataArray[i];
947 catch(exception& e) {
948 m->errorOut(e, "ScreenSeqsCommand", "createProcessesCreateSummary");
953 //***************************************************************************************************************
955 int ScreenSeqsCommand::screenGroupFile(set<string> badSeqNames){
957 ifstream inputGroups;
958 m->openInputFile(groupfile, inputGroups);
959 string seqName, group;
960 set<string>::iterator it;
962 string goodGroupFile = outputDir + m->getRootName(m->getSimpleName(groupfile)) + getOutputFileNameTag("group", groupfile);
963 outputNames.push_back(goodGroupFile); outputTypes["group"].push_back(goodGroupFile);
964 ofstream goodGroupOut; m->openOutputFile(goodGroupFile, goodGroupOut);
966 while(!inputGroups.eof()){
967 if (m->control_pressed) { goodGroupOut.close(); inputGroups.close(); m->mothurRemove(goodGroupFile); return 0; }
969 inputGroups >> seqName >> group;
970 it = badSeqNames.find(seqName);
972 if(it != badSeqNames.end()){
973 badSeqNames.erase(it);
976 goodGroupOut << seqName << '\t' << group << endl;
978 m->gobble(inputGroups);
981 if (m->control_pressed) { goodGroupOut.close(); inputGroups.close(); m->mothurRemove(goodGroupFile); return 0; }
983 //we were unable to remove some of the bad sequences
984 if (badSeqNames.size() != 0) {
985 for (it = badSeqNames.begin(); it != badSeqNames.end(); it++) {
986 m->mothurOut("Your groupfile does not include the sequence " + *it + " please correct.");
987 m->mothurOutEndLine();
992 goodGroupOut.close();
994 if (m->control_pressed) { m->mothurRemove(goodGroupFile); }
999 catch(exception& e) {
1000 m->errorOut(e, "ScreenSeqsCommand", "screenGroupFile");
1004 //***************************************************************************************************************
1005 int ScreenSeqsCommand::screenCountFile(set<string> badSeqNames){
1008 m->openInputFile(countfile, in);
1009 set<string>::iterator it;
1011 string goodCountFile = outputDir + m->getRootName(m->getSimpleName(countfile)) + getOutputFileNameTag("count", countfile);
1012 outputNames.push_back(goodCountFile); outputTypes["count"].push_back(goodCountFile);
1013 ofstream goodCountOut; m->openOutputFile(goodCountFile, goodCountOut);
1015 string headers = m->getline(in); m->gobble(in);
1016 goodCountOut << headers << endl;
1018 string name, rest; int thisTotal;
1021 if (m->control_pressed) { goodCountOut.close(); in.close(); m->mothurRemove(goodCountFile); return 0; }
1023 in >> name; m->gobble(in);
1024 in >> thisTotal; m->gobble(in);
1025 rest = m->getline(in); m->gobble(in);
1027 it = badSeqNames.find(name);
1029 if(it != badSeqNames.end()){
1030 badSeqNames.erase(it);
1033 goodCountOut << name << '\t' << thisTotal << '\t' << rest << endl;
1037 if (m->control_pressed) { goodCountOut.close(); in.close(); m->mothurRemove(goodCountFile); return 0; }
1039 //we were unable to remove some of the bad sequences
1040 if (badSeqNames.size() != 0) {
1041 for (it = badSeqNames.begin(); it != badSeqNames.end(); it++) {
1042 m->mothurOut("Your count file does not include the sequence " + *it + " please correct.");
1043 m->mothurOutEndLine();
1048 goodCountOut.close();
1050 //check for groups that have been eliminated
1052 if (ct.testGroups(goodCountFile)) {
1053 ct.readTable(goodCountFile);
1054 ct.printTable(goodCountFile);
1057 if (m->control_pressed) { m->mothurRemove(goodCountFile); }
1062 catch(exception& e) {
1063 m->errorOut(e, "ScreenSeqsCommand", "screenCountFile");
1067 //***************************************************************************************************************
1069 int ScreenSeqsCommand::screenAlignReport(set<string> badSeqNames){
1071 ifstream inputAlignReport;
1072 m->openInputFile(alignreport, inputAlignReport);
1073 string seqName, group;
1074 set<string>::iterator it;
1076 string goodAlignReportFile = outputDir + m->getRootName(m->getSimpleName(alignreport)) + getOutputFileNameTag("alignreport");
1077 outputNames.push_back(goodAlignReportFile); outputTypes["alignreport"].push_back(goodAlignReportFile);
1078 ofstream goodAlignReportOut; m->openOutputFile(goodAlignReportFile, goodAlignReportOut);
1080 while (!inputAlignReport.eof()) { // need to copy header
1081 char c = inputAlignReport.get();
1082 goodAlignReportOut << c;
1083 if (c == 10 || c == 13){ break; }
1086 while(!inputAlignReport.eof()){
1087 if (m->control_pressed) { goodAlignReportOut.close(); inputAlignReport.close(); m->mothurRemove(goodAlignReportFile); return 0; }
1089 inputAlignReport >> seqName;
1090 it = badSeqNames.find(seqName);
1092 while (!inputAlignReport.eof()) { // need to copy header
1093 char c = inputAlignReport.get();
1095 if (c == 10 || c == 13){ break; }
1098 if(it != badSeqNames.end()){
1099 badSeqNames.erase(it);
1102 goodAlignReportOut << seqName << '\t' << line;
1104 m->gobble(inputAlignReport);
1107 if (m->control_pressed) { goodAlignReportOut.close(); inputAlignReport.close(); m->mothurRemove(goodAlignReportFile); return 0; }
1109 //we were unable to remove some of the bad sequences
1110 if (badSeqNames.size() != 0) {
1111 for (it = badSeqNames.begin(); it != badSeqNames.end(); it++) {
1112 m->mothurOut("Your alignreport file does not include the sequence " + *it + " please correct.");
1113 m->mothurOutEndLine();
1117 inputAlignReport.close();
1118 goodAlignReportOut.close();
1120 if (m->control_pressed) { m->mothurRemove(goodAlignReportFile); return 0; }
1125 catch(exception& e) {
1126 m->errorOut(e, "ScreenSeqsCommand", "screenAlignReport");
1131 //***************************************************************************************************************
1133 int ScreenSeqsCommand::screenTaxonomy(set<string> badSeqNames){
1136 m->openInputFile(taxonomy, input);
1137 string seqName, tax;
1138 set<string>::iterator it;
1140 string goodTaxFile = outputDir + m->getRootName(m->getSimpleName(taxonomy)) + getOutputFileNameTag("taxonomy", taxonomy);
1141 outputNames.push_back(goodTaxFile); outputTypes["taxonomy"].push_back(goodTaxFile);
1142 ofstream goodTaxOut; m->openOutputFile(goodTaxFile, goodTaxOut);
1144 while(!input.eof()){
1145 if (m->control_pressed) { goodTaxOut.close(); input.close(); m->mothurRemove(goodTaxFile); return 0; }
1147 input >> seqName >> tax;
1148 it = badSeqNames.find(seqName);
1150 if(it != badSeqNames.end()){ badSeqNames.erase(it); }
1152 goodTaxOut << seqName << '\t' << tax << endl;
1157 if (m->control_pressed) { goodTaxOut.close(); input.close(); m->mothurRemove(goodTaxFile); return 0; }
1159 //we were unable to remove some of the bad sequences
1160 if (badSeqNames.size() != 0) {
1161 for (it = badSeqNames.begin(); it != badSeqNames.end(); it++) {
1162 m->mothurOut("Your taxonomy file does not include the sequence " + *it + " please correct.");
1163 m->mothurOutEndLine();
1170 if (m->control_pressed) { m->mothurRemove(goodTaxFile); return 0; }
1175 catch(exception& e) {
1176 m->errorOut(e, "ScreenSeqsCommand", "screenTaxonomy");
1181 //***************************************************************************************************************
1183 int ScreenSeqsCommand::screenQual(set<string> badSeqNames){
1186 m->openInputFile(qualfile, in);
1187 set<string>::iterator it;
1189 string goodQualFile = outputDir + m->getRootName(m->getSimpleName(qualfile)) + getOutputFileNameTag("qfile", qualfile);
1190 outputNames.push_back(goodQualFile); outputTypes["qfile"].push_back(goodQualFile);
1191 ofstream goodQual; m->openOutputFile(goodQualFile, goodQual);
1195 if (m->control_pressed) { goodQual.close(); in.close(); m->mothurRemove(goodQualFile); return 0; }
1197 string saveName = "";
1203 if (name.length() != 0) {
1204 saveName = name.substr(1);
1207 if (c == 10 || c == 13){ break; }
1214 char letter= in.get();
1215 if(letter == '>'){ in.putback(letter); break; }
1216 else{ scores += letter; }
1221 it = badSeqNames.find(saveName);
1223 if(it != badSeqNames.end()){
1224 badSeqNames.erase(it);
1226 goodQual << name << endl << scores;
1235 //we were unable to remove some of the bad sequences
1236 if (badSeqNames.size() != 0) {
1237 for (it = badSeqNames.begin(); it != badSeqNames.end(); it++) {
1238 m->mothurOut("Your qual file does not include the sequence " + *it + " please correct.");
1239 m->mothurOutEndLine();
1243 if (m->control_pressed) { m->mothurRemove(goodQualFile); return 0; }
1248 catch(exception& e) {
1249 m->errorOut(e, "ScreenSeqsCommand", "screenQual");
1254 //**********************************************************************************************************************
1256 int ScreenSeqsCommand::driver(linePair filePos, string goodFName, string badAccnosFName, string filename, set<string>& badSeqNames){
1259 m->openOutputFile(goodFName, goodFile);
1261 ofstream badAccnosFile;
1262 m->openOutputFile(badAccnosFName, badAccnosFile);
1265 m->openInputFile(filename, inFASTA);
1267 inFASTA.seekg(filePos.start);
1274 if (m->control_pressed) { return 0; }
1276 Sequence currSeq(inFASTA); m->gobble(inFASTA);
1277 if (currSeq.getName() != "") {
1278 bool goodSeq = 1; // innocent until proven guilty
1279 if(goodSeq == 1 && startPos != -1 && startPos < currSeq.getStartPos()) { goodSeq = 0; }
1280 if(goodSeq == 1 && endPos != -1 && endPos > currSeq.getEndPos()) { goodSeq = 0; }
1281 if(goodSeq == 1 && maxAmbig != -1 && maxAmbig < currSeq.getAmbigBases()) { goodSeq = 0; }
1282 if(goodSeq == 1 && maxHomoP != -1 && maxHomoP < currSeq.getLongHomoPolymer()) { goodSeq = 0; }
1283 if(goodSeq == 1 && minLength != -1 && minLength > currSeq.getNumBases()) { goodSeq = 0; }
1284 if(goodSeq == 1 && maxLength != -1 && maxLength < currSeq.getNumBases()) { goodSeq = 0; }
1287 currSeq.printSequence(goodFile);
1290 badAccnosFile << currSeq.getName() << endl;
1291 badSeqNames.insert(currSeq.getName());
1296 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
1297 unsigned long long pos = inFASTA.tellg();
1298 if ((pos == -1) || (pos >= filePos.end)) { break; }
1300 if (inFASTA.eof()) { break; }
1304 if((count) % 100 == 0){ m->mothurOut("Processing sequence: " + toString(count)); m->mothurOutEndLine(); }
1307 if((count) % 100 != 0){ m->mothurOut("Processing sequence: " + toString(count)); m->mothurOutEndLine(); }
1312 badAccnosFile.close();
1316 catch(exception& e) {
1317 m->errorOut(e, "ScreenSeqsCommand", "driver");
1321 //**********************************************************************************************************************
1323 int ScreenSeqsCommand::driverMPI(int start, int num, MPI_File& inMPI, MPI_File& goodFile, MPI_File& badAccnosFile, vector<unsigned long long>& MPIPos, set<string>& badSeqNames){
1325 string outputString = "";
1326 MPI_Status statusGood;
1327 MPI_Status statusBadAccnos;
1330 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
1332 for(int i=0;i<num;i++){
1334 if (m->control_pressed) { return 0; }
1336 //read next sequence
1337 int length = MPIPos[start+i+1] - MPIPos[start+i];
1339 char* buf4 = new char[length];
1341 MPI_File_read_at(inMPI, MPIPos[start+i], buf4, length, MPI_CHAR, &status);
1343 string tempBuf = buf4; delete buf4;
1344 if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length); }
1345 istringstream iss (tempBuf,istringstream::in);
1347 Sequence currSeq(iss);
1350 if (currSeq.getName() != "") {
1351 bool goodSeq = 1; // innocent until proven guilty
1352 if(goodSeq == 1 && startPos != -1 && startPos < currSeq.getStartPos()) { goodSeq = 0; }
1353 if(goodSeq == 1 && endPos != -1 && endPos > currSeq.getEndPos()) { goodSeq = 0; }
1354 if(goodSeq == 1 && maxAmbig != -1 && maxAmbig < currSeq.getAmbigBases()) { goodSeq = 0; }
1355 if(goodSeq == 1 && maxHomoP != -1 && maxHomoP < currSeq.getLongHomoPolymer()) { goodSeq = 0; }
1356 if(goodSeq == 1 && minLength != -1 && minLength > currSeq.getNumBases()) { goodSeq = 0; }
1357 if(goodSeq == 1 && maxLength != -1 && maxLength < currSeq.getNumBases()) { goodSeq = 0; }
1360 outputString = ">" + currSeq.getName() + "\n" + currSeq.getAligned() + "\n";
1363 length = outputString.length();
1364 char* buf2 = new char[length];
1365 memcpy(buf2, outputString.c_str(), length);
1367 MPI_File_write_shared(goodFile, buf2, length, MPI_CHAR, &statusGood);
1372 badSeqNames.insert(currSeq.getName());
1374 //write to bad accnos file
1375 outputString = currSeq.getName() + "\n";
1377 length = outputString.length();
1378 char* buf3 = new char[length];
1379 memcpy(buf3, outputString.c_str(), length);
1381 MPI_File_write_shared(badAccnosFile, buf3, length, MPI_CHAR, &statusBadAccnos);
1387 if((i) % 100 == 0){ m->mothurOut("Processing sequence: " + toString(i)); m->mothurOutEndLine(); }
1392 catch(exception& e) {
1393 m->errorOut(e, "ScreenSeqsCommand", "driverMPI");
1398 /**************************************************************************************************/
1400 int ScreenSeqsCommand::createProcesses(string goodFileName, string badAccnos, string filename, set<string>& badSeqNames) {
1403 vector<int> processIDS;
1407 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
1409 //loop through and create all the processes you want
1410 while (process != processors) {
1414 processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later
1416 }else if (pid == 0){
1417 num = driver(lines[process], goodFileName + toString(getpid()) + ".temp", badAccnos + toString(getpid()) + ".temp", filename, badSeqNames);
1419 //pass numSeqs to parent
1421 string tempFile = filename + toString(getpid()) + ".num.temp";
1422 m->openOutputFile(tempFile, out);
1428 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine();
1429 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
1434 num = driver(lines[0], goodFileName, badAccnos, filename, badSeqNames);
1436 //force parent to wait until all the processes are done
1437 for (int i=0;i<processIDS.size();i++) {
1438 int temp = processIDS[i];
1442 for (int i = 0; i < processIDS.size(); i++) {
1444 string tempFile = filename + toString(processIDS[i]) + ".num.temp";
1445 m->openInputFile(tempFile, in);
1446 if (!in.eof()) { int tempNum = 0; in >> tempNum; num += tempNum; }
1447 in.close(); m->mothurRemove(tempFile);
1449 m->appendFiles((goodFileName + toString(processIDS[i]) + ".temp"), goodFileName);
1450 m->mothurRemove((goodFileName + toString(processIDS[i]) + ".temp"));
1452 m->appendFiles((badAccnos + toString(processIDS[i]) + ".temp"), badAccnos);
1453 m->mothurRemove((badAccnos + toString(processIDS[i]) + ".temp"));
1456 //read badSeqs in because root process doesnt know what other "bad" seqs the children found
1458 int ableToOpen = m->openInputFile(badAccnos, inBad, "no error");
1460 if (ableToOpen == 0) {
1461 badSeqNames.clear();
1463 while (!inBad.eof()) {
1464 inBad >> tempName; m->gobble(inBad);
1465 badSeqNames.insert(tempName);
1471 //////////////////////////////////////////////////////////////////////////////////////////////////////
1472 //Windows version shared memory, so be careful when passing variables through the sumScreenData struct.
1473 //Above fork() will clone, so memory is separate, but that's not the case with windows,
1474 //Taking advantage of shared memory to allow both threads to add info to badSeqNames.
1475 //////////////////////////////////////////////////////////////////////////////////////////////////////
1477 vector<sumScreenData*> pDataArray;
1478 DWORD dwThreadIdArray[processors-1];
1479 HANDLE hThreadArray[processors-1];
1481 //Create processor worker threads.
1482 for( int i=0; i<processors-1; i++ ){
1484 string extension = "";
1485 if (i!=0) {extension += toString(i) + ".temp"; processIDS.push_back(i); }
1487 // Allocate memory for thread data.
1488 sumScreenData* tempSum = new sumScreenData(startPos, endPos, maxAmbig, maxHomoP, minLength, maxLength, filename, m, lines[i].start, lines[i].end,goodFileName+extension, badAccnos+extension);
1489 pDataArray.push_back(tempSum);
1491 //default security attributes, thread function name, argument to thread function, use default creation flags, returns the thread identifier
1492 hThreadArray[i] = CreateThread(NULL, 0, MySumScreenThreadFunction, pDataArray[i], 0, &dwThreadIdArray[i]);
1496 num = driver(lines[processors-1], (goodFileName+toString(processors-1)+".temp"), (badAccnos+toString(processors-1)+".temp"), filename, badSeqNames);
1497 processIDS.push_back(processors-1);
1499 //Wait until all threads have terminated.
1500 WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
1502 //Close all thread handles and free memory allocations.
1503 for(int i=0; i < pDataArray.size(); i++){
1504 num += pDataArray[i]->count;
1505 for (set<string>::iterator it = pDataArray[i]->badSeqNames.begin(); it != pDataArray[i]->badSeqNames.end(); it++) { badSeqNames.insert(*it); }
1506 CloseHandle(hThreadArray[i]);
1507 delete pDataArray[i];
1510 for (int i = 0; i < processIDS.size(); i++) {
1511 m->appendFiles((goodFileName + toString(processIDS[i]) + ".temp"), goodFileName);
1512 m->mothurRemove((goodFileName + toString(processIDS[i]) + ".temp"));
1514 m->appendFiles((badAccnos + toString(processIDS[i]) + ".temp"), badAccnos);
1515 m->mothurRemove((badAccnos + toString(processIDS[i]) + ".temp"));
1523 catch(exception& e) {
1524 m->errorOut(e, "ScreenSeqsCommand", "createProcesses");
1529 //***************************************************************************************************************