2 * screenseqscommand.cpp
5 * Created by Pat Schloss on 6/3/09.
6 * Copyright 2009 Patrick D. Schloss. All rights reserved.
10 #include "screenseqscommand.h"
13 //**********************************************************************************************************************
14 vector<string> ScreenSeqsCommand::setParameters(){
16 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta);
17 CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname);
18 CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pgroup);
19 CommandParameter pqfile("qfile", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pqfile);
20 CommandParameter palignreport("alignreport", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(palignreport);
21 CommandParameter ptax("taxonomy", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(ptax);
22 CommandParameter pstart("start", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pstart);
23 CommandParameter pend("end", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pend);
24 CommandParameter pmaxambig("maxambig", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pmaxambig);
25 CommandParameter pmaxhomop("maxhomop", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pmaxhomop);
26 CommandParameter pminlength("minlength", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pminlength);
27 CommandParameter pmaxlength("maxlength", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pmaxlength);
28 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
29 CommandParameter pcriteria("criteria", "Number", "", "90", "", "", "",false,false); parameters.push_back(pcriteria);
30 CommandParameter poptimize("optimize", "Multiple", "none-start-end-maxambig-maxhomop-minlength-maxlength", "none", "", "", "",true,false); parameters.push_back(poptimize);
31 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
32 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
34 vector<string> myArray;
35 for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); }
39 m->errorOut(e, "ScreenSeqsCommand", "setParameters");
43 //**********************************************************************************************************************
44 string ScreenSeqsCommand::getHelpString(){
46 string helpString = "";
47 helpString += "The screen.seqs command reads a fastafile and creates .....\n";
48 helpString += "The screen.seqs command parameters are fasta, start, end, maxambig, maxhomop, minlength, maxlength, name, group, qfile, alignreport, taxonomy, optimize, criteria and processors.\n";
49 helpString += "The fasta parameter is required.\n";
50 helpString += "The alignreport and taxonomy parameters allow you to remove bad seqs from taxonomy and alignreport files.\n";
51 helpString += "The start parameter is used to set a position the \"good\" sequences must start by. The default is -1.\n";
52 helpString += "The end parameter is used to set a position the \"good\" sequences must end after. The default is -1.\n";
53 helpString += "The maxambig parameter allows you to set the maximum number of ambigious bases allowed. The default is -1.\n";
54 helpString += "The maxhomop parameter allows you to set a maximum homopolymer length. \n";
55 helpString += "The minlength parameter allows you to set and minimum sequence length. \n";
56 helpString += "The maxlength parameter allows you to set and maximum sequence length. \n";
57 helpString += "The processors parameter allows you to specify the number of processors to use while running the command. The default is 1.\n";
58 helpString += "The optimize and criteria parameters allow you set the start, end, maxabig, maxhomop, minlength and maxlength parameters relative to your set of sequences .\n";
59 helpString += "For example optimize=start-end, criteria=90, would set the start and end values to the position 90% of your sequences started and ended.\n";
60 helpString += "The name parameter allows you to provide a namesfile, and the group parameter allows you to provide a groupfile.\n";
61 helpString += "The screen.seqs command should be in the following format: \n";
62 helpString += "screen.seqs(fasta=yourFastaFile, name=youNameFile, group=yourGroupFIle, start=yourStart, end=yourEnd, maxambig=yourMaxambig, \n";
63 helpString += "maxhomop=yourMaxhomop, minlength=youMinlength, maxlength=yourMaxlength) \n";
64 helpString += "Example screen.seqs(fasta=abrecovery.fasta, name=abrecovery.names, group=abrecovery.groups, start=..., end=..., maxambig=..., maxhomop=..., minlength=..., maxlength=...).\n";
65 helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n";
69 m->errorOut(e, "ScreenSeqsCommand", "getHelpString");
73 //**********************************************************************************************************************
74 string ScreenSeqsCommand::getOutputFileNameTag(string type, string inputName=""){
76 string outputFileName = "";
77 map<string, vector<string> >::iterator it;
79 //is this a type this command creates
80 it = outputTypes.find(type);
81 if (it == outputTypes.end()) { m->mothurOut("[ERROR]: this command doesn't create a " + type + " output file.\n"); }
83 if (type == "fasta") { outputFileName = "good" + m->getExtension(inputName); }
84 else if (type == "taxonomy") { outputFileName = "good" + m->getExtension(inputName); }
85 else if (type == "name") { outputFileName = "good" + m->getExtension(inputName); }
86 else if (type == "group") { outputFileName = "good" + m->getExtension(inputName); }
87 else if (type == "accnos") { outputFileName = "bad.accnos"; }
88 else if (type == "qfile") { outputFileName = "good" + m->getExtension(inputName); }
89 else if (type == "alignreport") { outputFileName = "good.align.report"; }
90 else { m->mothurOut("[ERROR]: No definition for type " + type + " output file tag.\n"); m->control_pressed = true; }
92 return outputFileName;
95 m->errorOut(e, "ScreenSeqsCommand", "getOutputFileNameTag");
100 //**********************************************************************************************************************
101 ScreenSeqsCommand::ScreenSeqsCommand(){
103 abort = true; calledHelp = true;
105 vector<string> tempOutNames;
106 outputTypes["fasta"] = tempOutNames;
107 outputTypes["name"] = tempOutNames;
108 outputTypes["group"] = tempOutNames;
109 outputTypes["alignreport"] = tempOutNames;
110 outputTypes["accnos"] = tempOutNames;
111 outputTypes["qfile"] = tempOutNames;
112 outputTypes["taxonomy"] = tempOutNames;
114 catch(exception& e) {
115 m->errorOut(e, "ScreenSeqsCommand", "ScreenSeqsCommand");
119 //***************************************************************************************************************
121 ScreenSeqsCommand::ScreenSeqsCommand(string option) {
123 abort = false; calledHelp = false;
125 //allow user to run help
126 if(option == "help") { help(); abort = true; calledHelp = true; }
127 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
130 vector<string> myArray = setParameters();
132 OptionParser parser(option);
133 map<string,string> parameters = parser.getParameters();
135 ValidParameters validParameter("screen.seqs");
136 map<string,string>::iterator it;
138 //check to make sure all parameters are valid for command
139 for (it = parameters.begin(); it != parameters.end(); it++) {
140 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
143 //initialize outputTypes
144 vector<string> tempOutNames;
145 outputTypes["fasta"] = tempOutNames;
146 outputTypes["name"] = tempOutNames;
147 outputTypes["group"] = tempOutNames;
148 outputTypes["alignreport"] = tempOutNames;
149 outputTypes["accnos"] = tempOutNames;
150 outputTypes["qfile"] = tempOutNames;
151 outputTypes["taxonomy"] = tempOutNames;
153 //if the user changes the input directory command factory will send this info to us in the output parameter
154 string inputDir = validParameter.validFile(parameters, "inputdir", false);
155 if (inputDir == "not found"){ inputDir = ""; }
158 it = parameters.find("fasta");
159 //user has given a template file
160 if(it != parameters.end()){
161 path = m->hasPath(it->second);
162 //if the user has not given a path then, add inputdir. else leave path alone.
163 if (path == "") { parameters["fasta"] = inputDir + it->second; }
166 it = parameters.find("group");
167 //user has given a template file
168 if(it != parameters.end()){
169 path = m->hasPath(it->second);
170 //if the user has not given a path then, add inputdir. else leave path alone.
171 if (path == "") { parameters["group"] = inputDir + it->second; }
174 it = parameters.find("name");
175 //user has given a template file
176 if(it != parameters.end()){
177 path = m->hasPath(it->second);
178 //if the user has not given a path then, add inputdir. else leave path alone.
179 if (path == "") { parameters["name"] = inputDir + it->second; }
182 it = parameters.find("alignreport");
183 //user has given a template file
184 if(it != parameters.end()){
185 path = m->hasPath(it->second);
186 //if the user has not given a path then, add inputdir. else leave path alone.
187 if (path == "") { parameters["alignreport"] = inputDir + it->second; }
190 it = parameters.find("qfile");
191 //user has given a template file
192 if(it != parameters.end()){
193 path = m->hasPath(it->second);
194 //if the user has not given a path then, add inputdir. else leave path alone.
195 if (path == "") { parameters["qfile"] = inputDir + it->second; }
198 it = parameters.find("taxonomy");
199 //user has given a template file
200 if(it != parameters.end()){
201 path = m->hasPath(it->second);
202 //if the user has not given a path then, add inputdir. else leave path alone.
203 if (path == "") { parameters["taxonomy"] = inputDir + it->second; }
207 //check for required parameters
208 fastafile = validParameter.validFile(parameters, "fasta", true);
209 if (fastafile == "not found") {
210 fastafile = m->getFastaFile();
211 if (fastafile != "") { m->mothurOut("Using " + fastafile + " as input file for the fasta parameter."); m->mothurOutEndLine(); }
212 else { m->mothurOut("You have no current fastafile and the fasta parameter is required."); m->mothurOutEndLine(); abort = true; }
214 else if (fastafile == "not open") { abort = true; }
215 else { m->setFastaFile(fastafile); }
217 groupfile = validParameter.validFile(parameters, "group", true);
218 if (groupfile == "not open") { abort = true; }
219 else if (groupfile == "not found") { groupfile = ""; }
220 else { m->setGroupFile(groupfile); }
222 qualfile = validParameter.validFile(parameters, "qfile", true);
223 if (qualfile == "not open") { abort = true; }
224 else if (qualfile == "not found") { qualfile = ""; }
225 else { m->setQualFile(qualfile); }
227 namefile = validParameter.validFile(parameters, "name", true);
228 if (namefile == "not open") { namefile = ""; abort = true; }
229 else if (namefile == "not found") { namefile = ""; }
230 else { m->setNameFile(namefile); }
232 alignreport = validParameter.validFile(parameters, "alignreport", true);
233 if (alignreport == "not open") { abort = true; }
234 else if (alignreport == "not found") { alignreport = ""; }
236 taxonomy = validParameter.validFile(parameters, "taxonomy", true);
237 if (taxonomy == "not open") { abort = true; }
238 else if (taxonomy == "not found") { taxonomy = ""; }
240 //if the user changes the output directory command factory will send this info to us in the output parameter
241 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){
243 outputDir += m->hasPath(fastafile); //if user entered a file with a path then preserve it
246 //check for optional parameter and set defaults
247 // ...at some point should added some additional type checking...
249 temp = validParameter.validFile(parameters, "start", false); if (temp == "not found") { temp = "-1"; }
250 m->mothurConvert(temp, startPos);
252 temp = validParameter.validFile(parameters, "end", false); if (temp == "not found") { temp = "-1"; }
253 m->mothurConvert(temp, endPos);
255 temp = validParameter.validFile(parameters, "maxambig", false); if (temp == "not found") { temp = "-1"; }
256 m->mothurConvert(temp, maxAmbig);
258 temp = validParameter.validFile(parameters, "maxhomop", false); if (temp == "not found") { temp = "-1"; }
259 m->mothurConvert(temp, maxHomoP);
261 temp = validParameter.validFile(parameters, "minlength", false); if (temp == "not found") { temp = "-1"; }
262 m->mothurConvert(temp, minLength);
264 temp = validParameter.validFile(parameters, "maxlength", false); if (temp == "not found") { temp = "-1"; }
265 m->mothurConvert(temp, maxLength);
267 temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); }
268 m->setProcessors(temp);
269 m->mothurConvert(temp, processors);
271 temp = validParameter.validFile(parameters, "optimize", false); //optimizing trumps the optimized values original value
272 if (temp == "not found"){ temp = "none"; }
273 m->splitAtDash(temp, optimize);
275 //check for invalid optimize options
276 set<string> validOptimizers;
277 validOptimizers.insert("none"); validOptimizers.insert("start"); validOptimizers.insert("end"); validOptimizers.insert("maxambig"); validOptimizers.insert("maxhomop"); validOptimizers.insert("minlength"); validOptimizers.insert("maxlength");
278 for (int i = 0; i < optimize.size(); i++) {
279 if (validOptimizers.count(optimize[i]) == 0) {
280 m->mothurOut(optimize[i] + " is not a valid optimizer. Valid options are start, end, maxambig, maxhomop, minlength and maxlength."); m->mothurOutEndLine();
281 optimize.erase(optimize.begin()+i);
286 if (optimize.size() == 1) { if (optimize[0] == "none") { optimize.clear(); } }
288 temp = validParameter.validFile(parameters, "criteria", false); if (temp == "not found"){ temp = "90"; }
289 m->mothurConvert(temp, criteria);
291 if (namefile == "") {
292 vector<string> files; files.push_back(fastafile);
293 parser.getNameFile(files);
298 catch(exception& e) {
299 m->errorOut(e, "ScreenSeqsCommand", "ScreenSeqsCommand");
303 //***************************************************************************************************************
305 int ScreenSeqsCommand::execute(){
308 if (abort == true) { if (calledHelp) { return 0; } return 2; }
310 //if the user want to optimize we need to know the 90% mark
311 vector<unsigned long long> positions;
312 if (optimize.size() != 0) { //get summary is paralellized so we need to divideFile, no need to do this step twice so I moved it here
313 //use the namefile to optimize correctly
314 if (namefile != "") { nameMap = m->readNames(namefile); }
315 getSummary(positions);
318 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
319 positions = m->divideFile(fastafile, processors);
320 for (int i = 0; i < (positions.size()-1); i++) { lines.push_back(linePair(positions[i], positions[(i+1)])); }
322 if(processors == 1){ lines.push_back(linePair(0, 1000)); }
324 int numFastaSeqs = 0;
325 positions = m->setFilePosFasta(fastafile, numFastaSeqs);
326 if (positions.size() < processors) { processors = positions.size(); }
328 //figure out how many sequences you have to process
329 int numSeqsPerProcessor = numFastaSeqs / processors;
330 for (int i = 0; i < processors; i++) {
331 int startIndex = i * numSeqsPerProcessor;
332 if(i == (processors - 1)){ numSeqsPerProcessor = numFastaSeqs - i * numSeqsPerProcessor; }
333 lines.push_back(linePair(positions[startIndex], numSeqsPerProcessor));
339 string goodSeqFile = outputDir + m->getRootName(m->getSimpleName(fastafile)) + getOutputFileNameTag("fasta", fastafile);
340 string badAccnosFile = outputDir + m->getRootName(m->getSimpleName(fastafile)) + getOutputFileNameTag("accnos");
342 int numFastaSeqs = 0;
343 set<string> badSeqNames;
344 int start = time(NULL);
347 int pid, numSeqsPerProcessor;
349 vector<unsigned long long> MPIPos;
352 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
353 MPI_Comm_size(MPI_COMM_WORLD, &processors);
357 MPI_File outMPIBadAccnos;
359 int outMode=MPI_MODE_CREATE|MPI_MODE_WRONLY;
360 int inMode=MPI_MODE_RDONLY;
362 char outGoodFilename[1024];
363 strcpy(outGoodFilename, goodSeqFile.c_str());
365 char outBadAccnosFilename[1024];
366 strcpy(outBadAccnosFilename, badAccnosFile.c_str());
368 char inFileName[1024];
369 strcpy(inFileName, fastafile.c_str());
371 MPI_File_open(MPI_COMM_WORLD, inFileName, inMode, MPI_INFO_NULL, &inMPI); //comm, filename, mode, info, filepointer
372 MPI_File_open(MPI_COMM_WORLD, outGoodFilename, outMode, MPI_INFO_NULL, &outMPIGood);
373 MPI_File_open(MPI_COMM_WORLD, outBadAccnosFilename, outMode, MPI_INFO_NULL, &outMPIBadAccnos);
375 if (m->control_pressed) { MPI_File_close(&inMPI); MPI_File_close(&outMPIGood); MPI_File_close(&outMPIBadAccnos); return 0; }
377 if (pid == 0) { //you are the root process
379 MPIPos = m->setFilePosFasta(fastafile, numFastaSeqs); //fills MPIPos, returns numSeqs
381 //send file positions to all processes
382 for(int i = 1; i < processors; i++) {
383 MPI_Send(&numFastaSeqs, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
384 MPI_Send(&MPIPos[0], (numFastaSeqs+1), MPI_LONG, i, tag, MPI_COMM_WORLD);
387 //figure out how many sequences you have to align
388 numSeqsPerProcessor = numFastaSeqs / processors;
389 int startIndex = pid * numSeqsPerProcessor;
390 if(pid == (processors - 1)){ numSeqsPerProcessor = numFastaSeqs - pid * numSeqsPerProcessor; }
393 driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPIGood, outMPIBadAccnos, MPIPos, badSeqNames);
395 if (m->control_pressed) { MPI_File_close(&inMPI); MPI_File_close(&outMPIGood); MPI_File_close(&outMPIBadAccnos); return 0; }
397 for (int i = 1; i < processors; i++) {
400 MPI_Recv(&badSize, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
402 }else{ //you are a child process
403 MPI_Recv(&numFastaSeqs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
404 MPIPos.resize(numFastaSeqs+1);
405 MPI_Recv(&MPIPos[0], (numFastaSeqs+1), MPI_LONG, 0, tag, MPI_COMM_WORLD, &status);
407 //figure out how many sequences you have to align
408 numSeqsPerProcessor = numFastaSeqs / processors;
409 int startIndex = pid * numSeqsPerProcessor;
410 if(pid == (processors - 1)){ numSeqsPerProcessor = numFastaSeqs - pid * numSeqsPerProcessor; }
413 driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPIGood, outMPIBadAccnos, MPIPos, badSeqNames);
415 if (m->control_pressed) { MPI_File_close(&inMPI); MPI_File_close(&outMPIGood); MPI_File_close(&outMPIBadAccnos); return 0; }
418 int badSize = badSeqNames.size();
419 MPI_Send(&badSize, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
423 MPI_File_close(&inMPI);
424 MPI_File_close(&outMPIGood);
425 MPI_File_close(&outMPIBadAccnos);
426 MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case
429 if(processors == 1){ numFastaSeqs = driver(lines[0], goodSeqFile, badAccnosFile, fastafile, badSeqNames); }
430 else{ numFastaSeqs = createProcesses(goodSeqFile, badAccnosFile, fastafile, badSeqNames); }
432 if (m->control_pressed) { m->mothurRemove(goodSeqFile); return 0; }
436 MPI_Comm_rank(MPI_COMM_WORLD, &pid);
438 if (pid == 0) { //only one process should fix files
440 //read accnos file with all names in it, process 0 just has its names
441 MPI_File inMPIAccnos;
444 char inFileName[1024];
445 strcpy(inFileName, badAccnosFile.c_str());
447 MPI_File_open(MPI_COMM_SELF, inFileName, inMode, MPI_INFO_NULL, &inMPIAccnos); //comm, filename, mode, info, filepointer
448 MPI_File_get_size(inMPIAccnos, &size);
450 char* buffer = new char[size];
451 MPI_File_read(inMPIAccnos, buffer, size, MPI_CHAR, &status);
453 string tempBuf = buffer;
454 if (tempBuf.length() > size) { tempBuf = tempBuf.substr(0, size); }
455 istringstream iss (tempBuf,istringstream::in);
458 MPI_File_close(&inMPIAccnos);
463 iss >> tempName; m->gobble(iss);
464 badSeqNames.insert(tempName);
468 if(namefile != "" && groupfile != "") {
469 screenNameGroupFile(badSeqNames);
470 if (m->control_pressed) { m->mothurRemove(goodSeqFile); return 0; }
471 }else if(namefile != "") {
472 screenNameGroupFile(badSeqNames);
473 if (m->control_pressed) { m->mothurRemove(goodSeqFile); return 0; }
474 }else if(groupfile != "") { screenGroupFile(badSeqNames); } // this screens just the group
476 if (m->control_pressed) { m->mothurRemove(goodSeqFile); return 0; }
478 if(alignreport != "") { screenAlignReport(badSeqNames); }
479 if(qualfile != "") { screenQual(badSeqNames); }
480 if(taxonomy != "") { screenTaxonomy(badSeqNames); }
482 if (m->control_pressed) { m->mothurRemove(goodSeqFile); return 0; }
488 m->mothurOutEndLine();
489 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
490 m->mothurOut(goodSeqFile); m->mothurOutEndLine(); outputTypes["fasta"].push_back(goodSeqFile);
491 m->mothurOut(badAccnosFile); m->mothurOutEndLine(); outputTypes["accnos"].push_back(badAccnosFile);
492 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
493 m->mothurOutEndLine();
494 m->mothurOutEndLine();
496 //set fasta file as new current fastafile
498 itTypes = outputTypes.find("fasta");
499 if (itTypes != outputTypes.end()) {
500 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFastaFile(current); }
503 itTypes = outputTypes.find("name");
504 if (itTypes != outputTypes.end()) {
505 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setNameFile(current); }
508 itTypes = outputTypes.find("group");
509 if (itTypes != outputTypes.end()) {
510 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setGroupFile(current); }
513 itTypes = outputTypes.find("qfile");
514 if (itTypes != outputTypes.end()) {
515 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setQualFile(current); }
518 itTypes = outputTypes.find("taxonomy");
519 if (itTypes != outputTypes.end()) {
520 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setTaxonomyFile(current); }
523 m->mothurOut("It took " + toString(time(NULL) - start) + " secs to screen " + toString(numFastaSeqs) + " sequences.");
524 m->mothurOutEndLine();
528 catch(exception& e) {
529 m->errorOut(e, "ScreenSeqsCommand", "execute");
534 //***************************************************************************************************************
536 int ScreenSeqsCommand::screenNameGroupFile(set<string> badSeqNames){
539 m->openInputFile(namefile, inputNames);
540 set<string> badSeqGroups;
541 string seqName, seqList, group;
542 set<string>::iterator it;
544 string goodNameFile = outputDir + m->getRootName(m->getSimpleName(namefile)) + getOutputFileNameTag("name", namefile);
545 outputNames.push_back(goodNameFile); outputTypes["name"].push_back(goodNameFile);
547 ofstream goodNameOut; m->openOutputFile(goodNameFile, goodNameOut);
549 while(!inputNames.eof()){
550 if (m->control_pressed) { goodNameOut.close(); inputNames.close(); m->mothurRemove(goodNameFile); return 0; }
552 inputNames >> seqName >> seqList;
553 it = badSeqNames.find(seqName);
555 if(it != badSeqNames.end()){
556 badSeqNames.erase(it);
560 for(int i=0;i<seqList.length();i++){
561 if(seqList[i] == ','){
562 badSeqGroups.insert(seqList.substr(start,i-start));
566 badSeqGroups.insert(seqList.substr(start,seqList.length()-start));
570 goodNameOut << seqName << '\t' << seqList << endl;
572 m->gobble(inputNames);
577 //we were unable to remove some of the bad sequences
578 if (badSeqNames.size() != 0) {
579 for (it = badSeqNames.begin(); it != badSeqNames.end(); it++) {
580 m->mothurOut("Your namefile does not include the sequence " + *it + " please correct.");
581 m->mothurOutEndLine();
587 ifstream inputGroups;
588 m->openInputFile(groupfile, inputGroups);
590 string goodGroupFile = outputDir + m->getRootName(m->getSimpleName(groupfile)) + getOutputFileNameTag("group", groupfile);
591 outputNames.push_back(goodGroupFile); outputTypes["group"].push_back(goodGroupFile);
593 ofstream goodGroupOut; m->openOutputFile(goodGroupFile, goodGroupOut);
595 while(!inputGroups.eof()){
596 if (m->control_pressed) { goodGroupOut.close(); inputGroups.close(); m->mothurRemove(goodNameFile); m->mothurRemove(goodGroupFile); return 0; }
598 inputGroups >> seqName >> group;
600 it = badSeqGroups.find(seqName);
602 if(it != badSeqGroups.end()){
603 badSeqGroups.erase(it);
606 goodGroupOut << seqName << '\t' << group << endl;
608 m->gobble(inputGroups);
611 goodGroupOut.close();
613 //we were unable to remove some of the bad sequences
614 if (badSeqGroups.size() != 0) {
615 for (it = badSeqGroups.begin(); it != badSeqGroups.end(); it++) {
616 m->mothurOut("Your groupfile does not include the sequence " + *it + " please correct.");
617 m->mothurOutEndLine();
626 catch(exception& e) {
627 m->errorOut(e, "ScreenSeqsCommand", "screenNameGroupFile");
631 //***************************************************************************************************************
632 int ScreenSeqsCommand::getSummary(vector<unsigned long long>& positions){
635 vector<int> startPosition;
636 vector<int> endPosition;
637 vector<int> seqLength;
638 vector<int> ambigBases;
639 vector<int> longHomoPolymer;
641 vector<unsigned long long> positions;
642 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
643 positions = m->divideFile(fastafile, processors);
644 for (int i = 0; i < (positions.size()-1); i++) { lines.push_back(linePair(positions[i], positions[(i+1)])); }
646 if(processors == 1){ lines.push_back(linePair(0, 1000)); }
648 int numFastaSeqs = 0;
649 positions = m->setFilePosFasta(fastafile, numFastaSeqs);
650 if (positions.size() < processors) { processors = positions.size(); }
652 //figure out how many sequences you have to process
653 int numSeqsPerProcessor = numFastaSeqs / processors;
654 for (int i = 0; i < processors; i++) {
655 int startIndex = i * numSeqsPerProcessor;
656 if(i == (processors - 1)){ numSeqsPerProcessor = numFastaSeqs - i * numSeqsPerProcessor; }
657 lines.push_back(linePair(positions[startIndex], numSeqsPerProcessor));
664 MPI_Comm_rank(MPI_COMM_WORLD, &pid);
667 driverCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, fastafile, lines[0]);
670 //#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
672 numSeqs = driverCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, fastafile, lines[0]);
674 numSeqs = createProcessesCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, fastafile);
677 if (m->control_pressed) { return 0; }
679 // numSeqs = driverCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, fastafile, lines[0]);
680 // if (m->control_pressed) { return 0; }
683 sort(startPosition.begin(), startPosition.end());
684 sort(endPosition.begin(), endPosition.end());
685 sort(seqLength.begin(), seqLength.end());
686 sort(ambigBases.begin(), ambigBases.end());
687 sort(longHomoPolymer.begin(), longHomoPolymer.end());
689 //numSeqs is the number of unique seqs, startPosition.size() is the total number of seqs, we want to optimize using all seqs
690 int criteriaPercentile = int(startPosition.size() * (criteria / (float) 100));
692 for (int i = 0; i < optimize.size(); i++) {
693 if (optimize[i] == "start") { startPos = startPosition[criteriaPercentile]; m->mothurOut("Optimizing start to " + toString(startPos) + "."); m->mothurOutEndLine(); }
694 else if (optimize[i] == "end") { int endcriteriaPercentile = int(endPosition.size() * ((100 - criteria) / (float) 100)); endPos = endPosition[endcriteriaPercentile]; m->mothurOut("Optimizing end to " + toString(endPos) + "."); m->mothurOutEndLine();}
695 else if (optimize[i] == "maxambig") { maxAmbig = ambigBases[criteriaPercentile]; m->mothurOut("Optimizing maxambig to " + toString(maxAmbig) + "."); m->mothurOutEndLine(); }
696 else if (optimize[i] == "maxhomop") { maxHomoP = longHomoPolymer[criteriaPercentile]; m->mothurOut("Optimizing maxhomop to " + toString(maxHomoP) + "."); m->mothurOutEndLine(); }
697 else if (optimize[i] == "minlength") { int mincriteriaPercentile = int(seqLength.size() * ((100 - criteria) / (float) 100)); minLength = seqLength[mincriteriaPercentile]; m->mothurOut("Optimizing minlength to " + toString(minLength) + "."); m->mothurOutEndLine(); }
698 else if (optimize[i] == "maxlength") { maxLength = seqLength[criteriaPercentile]; m->mothurOut("Optimizing maxlength to " + toString(maxLength) + "."); m->mothurOutEndLine(); }
705 MPI_Comm_rank(MPI_COMM_WORLD, &pid);
706 MPI_Comm_size(MPI_COMM_WORLD, &processors);
709 //send file positions to all processes
710 for(int i = 1; i < processors; i++) {
711 MPI_Send(&startPos, 1, MPI_INT, i, 2001, MPI_COMM_WORLD);
712 MPI_Send(&endPos, 1, MPI_INT, i, 2001, MPI_COMM_WORLD);
713 MPI_Send(&maxAmbig, 1, MPI_INT, i, 2001, MPI_COMM_WORLD);
714 MPI_Send(&maxHomoP, 1, MPI_INT, i, 2001, MPI_COMM_WORLD);
715 MPI_Send(&minLength, 1, MPI_INT, i, 2001, MPI_COMM_WORLD);
716 MPI_Send(&maxLength, 1, MPI_INT, i, 2001, MPI_COMM_WORLD);
719 MPI_Recv(&startPos, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status);
720 MPI_Recv(&endPos, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status);
721 MPI_Recv(&maxAmbig, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status);
722 MPI_Recv(&maxHomoP, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status);
723 MPI_Recv(&minLength, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status);
724 MPI_Recv(&maxLength, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status);
726 MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case
730 catch(exception& e) {
731 m->errorOut(e, "ScreenSeqsCommand", "getSummary");
735 /**************************************************************************************/
736 int ScreenSeqsCommand::driverCreateSummary(vector<int>& startPosition, vector<int>& endPosition, vector<int>& seqLength, vector<int>& ambigBases, vector<int>& longHomoPolymer, string filename, linePair filePos) {
740 m->openInputFile(filename, in);
742 in.seekg(filePos.start);
749 if (m->control_pressed) { in.close(); return 1; }
751 Sequence current(in); m->gobble(in);
753 if (current.getName() != "") {
755 if (namefile != "") {
756 //make sure this sequence is in the namefile, else error
757 map<string, int>::iterator it = nameMap.find(current.getName());
759 if (it == nameMap.end()) { m->mothurOut("[ERROR]: " + current.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); m->control_pressed = true; }
760 else { num = it->second; }
763 //for each sequence this sequence represents
764 for (int i = 0; i < num; i++) {
765 startPosition.push_back(current.getStartPos());
766 endPosition.push_back(current.getEndPos());
767 seqLength.push_back(current.getNumBases());
768 ambigBases.push_back(current.getAmbigBases());
769 longHomoPolymer.push_back(current.getLongHomoPolymer());
774 //if((count) % 100 == 0){ m->mothurOut("Optimizing sequence: " + toString(count)); m->mothurOutEndLine(); }
775 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
776 unsigned long long pos = in.tellg();
777 if ((pos == -1) || (pos >= filePos.end)) { break; }
779 if (in.eof()) { break; }
788 catch(exception& e) {
789 m->errorOut(e, "ScreenSeqsCommand", "driverCreateSummary");
793 /**************************************************************************************************/
794 int ScreenSeqsCommand::createProcessesCreateSummary(vector<int>& startPosition, vector<int>& endPosition, vector<int>& seqLength, vector<int>& ambigBases, vector<int>& longHomoPolymer, string filename) {
799 vector<int> processIDS;
801 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
803 //loop through and create all the processes you want
804 while (process != processors) {
808 processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later
811 num = driverCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, fastafile, lines[process]);
813 //pass numSeqs to parent
815 string tempFile = fastafile + toString(getpid()) + ".num.temp";
816 m->openOutputFile(tempFile, out);
819 out << startPosition.size() << endl;
820 for (int k = 0; k < startPosition.size(); k++) { out << startPosition[k] << '\t'; } out << endl;
821 for (int k = 0; k < endPosition.size(); k++) { out << endPosition[k] << '\t'; } out << endl;
822 for (int k = 0; k < seqLength.size(); k++) { out << seqLength[k] << '\t'; } out << endl;
823 for (int k = 0; k < ambigBases.size(); k++) { out << ambigBases[k] << '\t'; } out << endl;
824 for (int k = 0; k < longHomoPolymer.size(); k++) { out << longHomoPolymer[k] << '\t'; } out << endl;
830 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine();
831 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
836 num = driverCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, fastafile, lines[0]);
838 //force parent to wait until all the processes are done
839 for (int i=0;i<processIDS.size();i++) {
840 int temp = processIDS[i];
844 //parent reads in and combine Filter info
845 for (int i = 0; i < processIDS.size(); i++) {
846 string tempFilename = fastafile + toString(processIDS[i]) + ".num.temp";
848 m->openInputFile(tempFilename, in);
851 in >> tempNum; m->gobble(in); num += tempNum;
852 in >> tempNum; m->gobble(in);
853 for (int k = 0; k < tempNum; k++) { in >> temp; startPosition.push_back(temp); } m->gobble(in);
854 for (int k = 0; k < tempNum; k++) { in >> temp; endPosition.push_back(temp); } m->gobble(in);
855 for (int k = 0; k < tempNum; k++) { in >> temp; seqLength.push_back(temp); } m->gobble(in);
856 for (int k = 0; k < tempNum; k++) { in >> temp; ambigBases.push_back(temp); } m->gobble(in);
857 for (int k = 0; k < tempNum; k++) { in >> temp; longHomoPolymer.push_back(temp); } m->gobble(in);
860 m->mothurRemove(tempFilename);
865 //////////////////////////////////////////////////////////////////////////////////////////////////////
866 //Windows version shared memory, so be careful when passing variables through the seqSumData struct.
867 //Above fork() will clone, so memory is separate, but that's not the case with windows,
868 //Taking advantage of shared memory to allow both threads to add info to vectors.
869 //////////////////////////////////////////////////////////////////////////////////////////////////////
871 vector<sumData*> pDataArray;
872 DWORD dwThreadIdArray[processors-1];
873 HANDLE hThreadArray[processors-1];
875 //Create processor worker threads.
876 for( int i=0; i<processors-1; i++ ){
878 // Allocate memory for thread data.
879 sumData* tempSum = new sumData(filename, m, lines[i].start, lines[i].end, namefile, nameMap);
880 pDataArray.push_back(tempSum);
882 //MySeqSumThreadFunction is in header. It must be global or static to work with the threads.
883 //default security attributes, thread function name, argument to thread function, use default creation flags, returns the thread identifier
884 hThreadArray[i] = CreateThread(NULL, 0, MySumThreadFunction, pDataArray[i], 0, &dwThreadIdArray[i]);
888 num = driverCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, fastafile, lines[processors-1]);
890 //Wait until all threads have terminated.
891 WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
893 //Close all thread handles and free memory allocations.
894 for(int i=0; i < pDataArray.size(); i++){
895 num += pDataArray[i]->count;
896 for (int k = 0; k < pDataArray[i]->startPosition.size(); k++) { startPosition.push_back(pDataArray[i]->startPosition[k]); }
897 for (int k = 0; k < pDataArray[i]->endPosition.size(); k++) { endPosition.push_back(pDataArray[i]->endPosition[k]); }
898 for (int k = 0; k < pDataArray[i]->seqLength.size(); k++) { seqLength.push_back(pDataArray[i]->seqLength[k]); }
899 for (int k = 0; k < pDataArray[i]->ambigBases.size(); k++) { ambigBases.push_back(pDataArray[i]->ambigBases[k]); }
900 for (int k = 0; k < pDataArray[i]->longHomoPolymer.size(); k++) { longHomoPolymer.push_back(pDataArray[i]->longHomoPolymer[k]); }
901 CloseHandle(hThreadArray[i]);
902 delete pDataArray[i];
908 catch(exception& e) {
909 m->errorOut(e, "ScreenSeqsCommand", "createProcessesCreateSummary");
914 //***************************************************************************************************************
916 int ScreenSeqsCommand::screenGroupFile(set<string> badSeqNames){
918 ifstream inputGroups;
919 m->openInputFile(groupfile, inputGroups);
920 string seqName, group;
921 set<string>::iterator it;
923 string goodGroupFile = outputDir + m->getRootName(m->getSimpleName(groupfile)) + getOutputFileNameTag("group", groupfile);
924 outputNames.push_back(goodGroupFile); outputTypes["group"].push_back(goodGroupFile);
925 ofstream goodGroupOut; m->openOutputFile(goodGroupFile, goodGroupOut);
927 while(!inputGroups.eof()){
928 if (m->control_pressed) { goodGroupOut.close(); inputGroups.close(); m->mothurRemove(goodGroupFile); return 0; }
930 inputGroups >> seqName >> group;
931 it = badSeqNames.find(seqName);
933 if(it != badSeqNames.end()){
934 badSeqNames.erase(it);
937 goodGroupOut << seqName << '\t' << group << endl;
939 m->gobble(inputGroups);
942 if (m->control_pressed) { goodGroupOut.close(); inputGroups.close(); m->mothurRemove(goodGroupFile); return 0; }
944 //we were unable to remove some of the bad sequences
945 if (badSeqNames.size() != 0) {
946 for (it = badSeqNames.begin(); it != badSeqNames.end(); it++) {
947 m->mothurOut("Your groupfile does not include the sequence " + *it + " please correct.");
948 m->mothurOutEndLine();
953 goodGroupOut.close();
955 if (m->control_pressed) { m->mothurRemove(goodGroupFile); }
960 catch(exception& e) {
961 m->errorOut(e, "ScreenSeqsCommand", "screenGroupFile");
966 //***************************************************************************************************************
968 int ScreenSeqsCommand::screenAlignReport(set<string> badSeqNames){
970 ifstream inputAlignReport;
971 m->openInputFile(alignreport, inputAlignReport);
972 string seqName, group;
973 set<string>::iterator it;
975 string goodAlignReportFile = outputDir + m->getRootName(m->getSimpleName(alignreport)) + getOutputFileNameTag("alignreport");
976 outputNames.push_back(goodAlignReportFile); outputTypes["alignreport"].push_back(goodAlignReportFile);
977 ofstream goodAlignReportOut; m->openOutputFile(goodAlignReportFile, goodAlignReportOut);
979 while (!inputAlignReport.eof()) { // need to copy header
980 char c = inputAlignReport.get();
981 goodAlignReportOut << c;
982 if (c == 10 || c == 13){ break; }
985 while(!inputAlignReport.eof()){
986 if (m->control_pressed) { goodAlignReportOut.close(); inputAlignReport.close(); m->mothurRemove(goodAlignReportFile); return 0; }
988 inputAlignReport >> seqName;
989 it = badSeqNames.find(seqName);
991 while (!inputAlignReport.eof()) { // need to copy header
992 char c = inputAlignReport.get();
994 if (c == 10 || c == 13){ break; }
997 if(it != badSeqNames.end()){
998 badSeqNames.erase(it);
1001 goodAlignReportOut << seqName << '\t' << line;
1003 m->gobble(inputAlignReport);
1006 if (m->control_pressed) { goodAlignReportOut.close(); inputAlignReport.close(); m->mothurRemove(goodAlignReportFile); return 0; }
1008 //we were unable to remove some of the bad sequences
1009 if (badSeqNames.size() != 0) {
1010 for (it = badSeqNames.begin(); it != badSeqNames.end(); it++) {
1011 m->mothurOut("Your alignreport file does not include the sequence " + *it + " please correct.");
1012 m->mothurOutEndLine();
1016 inputAlignReport.close();
1017 goodAlignReportOut.close();
1019 if (m->control_pressed) { m->mothurRemove(goodAlignReportFile); return 0; }
1024 catch(exception& e) {
1025 m->errorOut(e, "ScreenSeqsCommand", "screenAlignReport");
1030 //***************************************************************************************************************
1032 int ScreenSeqsCommand::screenTaxonomy(set<string> badSeqNames){
1035 m->openInputFile(taxonomy, input);
1036 string seqName, tax;
1037 set<string>::iterator it;
1039 string goodTaxFile = outputDir + m->getRootName(m->getSimpleName(taxonomy)) + getOutputFileNameTag("taxonomy", taxonomy);
1040 outputNames.push_back(goodTaxFile); outputTypes["taxonomy"].push_back(goodTaxFile);
1041 ofstream goodTaxOut; m->openOutputFile(goodTaxFile, goodTaxOut);
1043 while(!input.eof()){
1044 if (m->control_pressed) { goodTaxOut.close(); input.close(); m->mothurRemove(goodTaxFile); return 0; }
1046 input >> seqName >> tax;
1047 it = badSeqNames.find(seqName);
1049 if(it != badSeqNames.end()){ badSeqNames.erase(it); }
1051 goodTaxOut << seqName << '\t' << tax << endl;
1056 if (m->control_pressed) { goodTaxOut.close(); input.close(); m->mothurRemove(goodTaxFile); return 0; }
1058 //we were unable to remove some of the bad sequences
1059 if (badSeqNames.size() != 0) {
1060 for (it = badSeqNames.begin(); it != badSeqNames.end(); it++) {
1061 m->mothurOut("Your taxonomy file does not include the sequence " + *it + " please correct.");
1062 m->mothurOutEndLine();
1069 if (m->control_pressed) { m->mothurRemove(goodTaxFile); return 0; }
1074 catch(exception& e) {
1075 m->errorOut(e, "ScreenSeqsCommand", "screenTaxonomy");
1080 //***************************************************************************************************************
1082 int ScreenSeqsCommand::screenQual(set<string> badSeqNames){
1085 m->openInputFile(qualfile, in);
1086 set<string>::iterator it;
1088 string goodQualFile = outputDir + m->getRootName(m->getSimpleName(qualfile)) + getOutputFileNameTag("qfile", qualfile);
1089 outputNames.push_back(goodQualFile); outputTypes["qfile"].push_back(goodQualFile);
1090 ofstream goodQual; m->openOutputFile(goodQualFile, goodQual);
1094 if (m->control_pressed) { goodQual.close(); in.close(); m->mothurRemove(goodQualFile); return 0; }
1096 string saveName = "";
1102 if (name.length() != 0) {
1103 saveName = name.substr(1);
1106 if (c == 10 || c == 13){ break; }
1113 char letter= in.get();
1114 if(letter == '>'){ in.putback(letter); break; }
1115 else{ scores += letter; }
1120 it = badSeqNames.find(saveName);
1122 if(it != badSeqNames.end()){
1123 badSeqNames.erase(it);
1125 goodQual << name << endl << scores;
1134 //we were unable to remove some of the bad sequences
1135 if (badSeqNames.size() != 0) {
1136 for (it = badSeqNames.begin(); it != badSeqNames.end(); it++) {
1137 m->mothurOut("Your qual file does not include the sequence " + *it + " please correct.");
1138 m->mothurOutEndLine();
1142 if (m->control_pressed) { m->mothurRemove(goodQualFile); return 0; }
1147 catch(exception& e) {
1148 m->errorOut(e, "ScreenSeqsCommand", "screenQual");
1153 //**********************************************************************************************************************
1155 int ScreenSeqsCommand::driver(linePair filePos, string goodFName, string badAccnosFName, string filename, set<string>& badSeqNames){
1158 m->openOutputFile(goodFName, goodFile);
1160 ofstream badAccnosFile;
1161 m->openOutputFile(badAccnosFName, badAccnosFile);
1164 m->openInputFile(filename, inFASTA);
1166 inFASTA.seekg(filePos.start);
1173 if (m->control_pressed) { return 0; }
1175 Sequence currSeq(inFASTA); m->gobble(inFASTA);
1176 if (currSeq.getName() != "") {
1177 bool goodSeq = 1; // innocent until proven guilty
1178 if(goodSeq == 1 && startPos != -1 && startPos < currSeq.getStartPos()) { goodSeq = 0; }
1179 if(goodSeq == 1 && endPos != -1 && endPos > currSeq.getEndPos()) { goodSeq = 0; }
1180 if(goodSeq == 1 && maxAmbig != -1 && maxAmbig < currSeq.getAmbigBases()) { goodSeq = 0; }
1181 if(goodSeq == 1 && maxHomoP != -1 && maxHomoP < currSeq.getLongHomoPolymer()) { goodSeq = 0; }
1182 if(goodSeq == 1 && minLength != -1 && minLength > currSeq.getNumBases()) { goodSeq = 0; }
1183 if(goodSeq == 1 && maxLength != -1 && maxLength < currSeq.getNumBases()) { goodSeq = 0; }
1186 currSeq.printSequence(goodFile);
1189 badAccnosFile << currSeq.getName() << endl;
1190 badSeqNames.insert(currSeq.getName());
1195 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
1196 unsigned long long pos = inFASTA.tellg();
1197 if ((pos == -1) || (pos >= filePos.end)) { break; }
1199 if (inFASTA.eof()) { break; }
1203 if((count) % 100 == 0){ m->mothurOut("Processing sequence: " + toString(count)); m->mothurOutEndLine(); }
1206 if((count) % 100 != 0){ m->mothurOut("Processing sequence: " + toString(count)); m->mothurOutEndLine(); }
1211 badAccnosFile.close();
1215 catch(exception& e) {
1216 m->errorOut(e, "ScreenSeqsCommand", "driver");
1220 //**********************************************************************************************************************
1222 int ScreenSeqsCommand::driverMPI(int start, int num, MPI_File& inMPI, MPI_File& goodFile, MPI_File& badAccnosFile, vector<unsigned long long>& MPIPos, set<string>& badSeqNames){
1224 string outputString = "";
1225 MPI_Status statusGood;
1226 MPI_Status statusBadAccnos;
1229 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
1231 for(int i=0;i<num;i++){
1233 if (m->control_pressed) { return 0; }
1235 //read next sequence
1236 int length = MPIPos[start+i+1] - MPIPos[start+i];
1238 char* buf4 = new char[length];
1240 MPI_File_read_at(inMPI, MPIPos[start+i], buf4, length, MPI_CHAR, &status);
1242 string tempBuf = buf4; delete buf4;
1243 if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length); }
1244 istringstream iss (tempBuf,istringstream::in);
1246 Sequence currSeq(iss);
1249 if (currSeq.getName() != "") {
1250 bool goodSeq = 1; // innocent until proven guilty
1251 if(goodSeq == 1 && startPos != -1 && startPos < currSeq.getStartPos()) { goodSeq = 0; }
1252 if(goodSeq == 1 && endPos != -1 && endPos > currSeq.getEndPos()) { goodSeq = 0; }
1253 if(goodSeq == 1 && maxAmbig != -1 && maxAmbig < currSeq.getAmbigBases()) { goodSeq = 0; }
1254 if(goodSeq == 1 && maxHomoP != -1 && maxHomoP < currSeq.getLongHomoPolymer()) { goodSeq = 0; }
1255 if(goodSeq == 1 && minLength != -1 && minLength > currSeq.getNumBases()) { goodSeq = 0; }
1256 if(goodSeq == 1 && maxLength != -1 && maxLength < currSeq.getNumBases()) { goodSeq = 0; }
1259 outputString = ">" + currSeq.getName() + "\n" + currSeq.getAligned() + "\n";
1262 length = outputString.length();
1263 char* buf2 = new char[length];
1264 memcpy(buf2, outputString.c_str(), length);
1266 MPI_File_write_shared(goodFile, buf2, length, MPI_CHAR, &statusGood);
1271 badSeqNames.insert(currSeq.getName());
1273 //write to bad accnos file
1274 outputString = currSeq.getName() + "\n";
1276 length = outputString.length();
1277 char* buf3 = new char[length];
1278 memcpy(buf3, outputString.c_str(), length);
1280 MPI_File_write_shared(badAccnosFile, buf3, length, MPI_CHAR, &statusBadAccnos);
1286 if((i) % 100 == 0){ m->mothurOut("Processing sequence: " + toString(i)); m->mothurOutEndLine(); }
1291 catch(exception& e) {
1292 m->errorOut(e, "ScreenSeqsCommand", "driverMPI");
1297 /**************************************************************************************************/
1299 int ScreenSeqsCommand::createProcesses(string goodFileName, string badAccnos, string filename, set<string>& badSeqNames) {
1302 vector<int> processIDS;
1306 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
1308 //loop through and create all the processes you want
1309 while (process != processors) {
1313 processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later
1315 }else if (pid == 0){
1316 num = driver(lines[process], goodFileName + toString(getpid()) + ".temp", badAccnos + toString(getpid()) + ".temp", filename, badSeqNames);
1318 //pass numSeqs to parent
1320 string tempFile = filename + toString(getpid()) + ".num.temp";
1321 m->openOutputFile(tempFile, out);
1327 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine();
1328 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
1333 num = driver(lines[0], goodFileName, badAccnos, filename, badSeqNames);
1335 //force parent to wait until all the processes are done
1336 for (int i=0;i<processIDS.size();i++) {
1337 int temp = processIDS[i];
1341 for (int i = 0; i < processIDS.size(); i++) {
1343 string tempFile = filename + toString(processIDS[i]) + ".num.temp";
1344 m->openInputFile(tempFile, in);
1345 if (!in.eof()) { int tempNum = 0; in >> tempNum; num += tempNum; }
1346 in.close(); m->mothurRemove(tempFile);
1348 m->appendFiles((goodFileName + toString(processIDS[i]) + ".temp"), goodFileName);
1349 m->mothurRemove((goodFileName + toString(processIDS[i]) + ".temp"));
1351 m->appendFiles((badAccnos + toString(processIDS[i]) + ".temp"), badAccnos);
1352 m->mothurRemove((badAccnos + toString(processIDS[i]) + ".temp"));
1355 //read badSeqs in because root process doesnt know what other "bad" seqs the children found
1357 int ableToOpen = m->openInputFile(badAccnos, inBad, "no error");
1359 if (ableToOpen == 0) {
1360 badSeqNames.clear();
1362 while (!inBad.eof()) {
1363 inBad >> tempName; m->gobble(inBad);
1364 badSeqNames.insert(tempName);
1370 //////////////////////////////////////////////////////////////////////////////////////////////////////
1371 //Windows version shared memory, so be careful when passing variables through the sumScreenData struct.
1372 //Above fork() will clone, so memory is separate, but that's not the case with windows,
1373 //Taking advantage of shared memory to allow both threads to add info to badSeqNames.
1374 //////////////////////////////////////////////////////////////////////////////////////////////////////
1376 vector<sumScreenData*> pDataArray;
1377 DWORD dwThreadIdArray[processors-1];
1378 HANDLE hThreadArray[processors-1];
1380 //Create processor worker threads.
1381 for( int i=0; i<processors-1; i++ ){
1383 string extension = "";
1384 if (i!=0) {extension += toString(i) + ".temp"; processIDS.push_back(i); }
1386 // Allocate memory for thread data.
1387 sumScreenData* tempSum = new sumScreenData(startPos, endPos, maxAmbig, maxHomoP, minLength, maxLength, filename, m, lines[i].start, lines[i].end,goodFileName+extension, badAccnos+extension);
1388 pDataArray.push_back(tempSum);
1390 //default security attributes, thread function name, argument to thread function, use default creation flags, returns the thread identifier
1391 hThreadArray[i] = CreateThread(NULL, 0, MySumScreenThreadFunction, pDataArray[i], 0, &dwThreadIdArray[i]);
1395 num = driver(lines[processors-1], (goodFileName+toString(processors-1)+".temp"), (badAccnos+toString(processors-1)+".temp"), filename, badSeqNames);
1396 processIDS.push_back(processors-1);
1398 //Wait until all threads have terminated.
1399 WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
1401 //Close all thread handles and free memory allocations.
1402 for(int i=0; i < pDataArray.size(); i++){
1403 num += pDataArray[i]->count;
1404 for (set<string>::iterator it = pDataArray[i]->badSeqNames.begin(); it != pDataArray[i]->badSeqNames.end(); it++) { badSeqNames.insert(*it); }
1405 CloseHandle(hThreadArray[i]);
1406 delete pDataArray[i];
1409 for (int i = 0; i < processIDS.size(); i++) {
1410 m->appendFiles((goodFileName + toString(processIDS[i]) + ".temp"), goodFileName);
1411 m->mothurRemove((goodFileName + toString(processIDS[i]) + ".temp"));
1413 m->appendFiles((badAccnos + toString(processIDS[i]) + ".temp"), badAccnos);
1414 m->mothurRemove((badAccnos + toString(processIDS[i]) + ".temp"));
1422 catch(exception& e) {
1423 m->errorOut(e, "ScreenSeqsCommand", "createProcesses");
1428 //***************************************************************************************************************