2 * screenseqscommand.cpp
5 * Created by Pat Schloss on 6/3/09.
6 * Copyright 2009 Patrick D. Schloss. All rights reserved.
10 #include "screenseqscommand.h"
11 #include "sequence.hpp"
13 //**********************************************************************************************************************
14 vector<string> ScreenSeqsCommand::setParameters(){
16 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta);
17 CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname);
18 CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pgroup);
19 CommandParameter pqfile("qfile", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pqfile);
20 CommandParameter palignreport("alignreport", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(palignreport);
21 CommandParameter pstart("start", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pstart);
22 CommandParameter pend("end", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pend);
23 CommandParameter pmaxambig("maxambig", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pmaxambig);
24 CommandParameter pmaxhomop("maxhomop", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pmaxhomop);
25 CommandParameter pminlength("minlength", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pminlength);
26 CommandParameter pmaxlength("maxlength", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pmaxlength);
27 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
28 CommandParameter pcriteria("criteria", "Number", "", "90", "", "", "",false,false); parameters.push_back(pcriteria);
29 CommandParameter poptimize("optimize", "Multiple", "none-start-end-maxambig-maxhomop-minlength-maxlength", "none", "", "", "",true,false); parameters.push_back(poptimize);
30 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
31 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
33 vector<string> myArray;
34 for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); }
38 m->errorOut(e, "ScreenSeqsCommand", "setParameters");
42 //**********************************************************************************************************************
43 string ScreenSeqsCommand::getHelpString(){
45 string helpString = "";
46 helpString += "The screen.seqs command reads a fastafile and creates .....\n";
47 helpString += "The screen.seqs command parameters are fasta, start, end, maxambig, maxhomop, minlength, maxlength, name, group, qfile, optimize, criteria and processors.\n";
48 helpString += "The fasta parameter is required.\n";
49 helpString += "The start parameter .... The default is -1.\n";
50 helpString += "The end parameter .... The default is -1.\n";
51 helpString += "The maxambig parameter allows you to set the maximum number of ambigious bases allowed. The default is -1.\n";
52 helpString += "The maxhomop parameter allows you to set a maximum homopolymer length. \n";
53 helpString += "The minlength parameter allows you to set and minimum sequence length. \n";
54 helpString += "The maxlength parameter allows you to set and maximum sequence length. \n";
55 helpString += "The processors parameter allows you to specify the number of processors to use while running the command. The default is 1.\n";
56 helpString += "The optimize and criteria parameters allow you set the start, end, maxabig, maxhomop, minlength and maxlength parameters relative to your set of sequences .\n";
57 helpString += "For example optimize=start-end, criteria=90, would set the start and end values to the position 90% of your sequences started and ended.\n";
58 helpString += "The name parameter allows you to provide a namesfile, and the group parameter allows you to provide a groupfile.\n";
59 helpString += "The screen.seqs command should be in the following format: \n";
60 helpString += "screen.seqs(fasta=yourFastaFile, name=youNameFile, group=yourGroupFIle, start=yourStart, end=yourEnd, maxambig=yourMaxambig, \n";
61 helpString += "maxhomop=yourMaxhomop, minlength=youMinlength, maxlength=yourMaxlength) \n";
62 helpString += "Example screen.seqs(fasta=abrecovery.fasta, name=abrecovery.names, group=abrecovery.groups, start=..., end=..., maxambig=..., maxhomop=..., minlength=..., maxlength=...).\n";
63 helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n";
67 m->errorOut(e, "ScreenSeqsCommand", "getHelpString");
71 //**********************************************************************************************************************
72 ScreenSeqsCommand::ScreenSeqsCommand(){
74 abort = true; calledHelp = true;
76 vector<string> tempOutNames;
77 outputTypes["fasta"] = tempOutNames;
78 outputTypes["name"] = tempOutNames;
79 outputTypes["group"] = tempOutNames;
80 outputTypes["alignreport"] = tempOutNames;
81 outputTypes["accnos"] = tempOutNames;
82 outputTypes["qfile"] = tempOutNames;
85 m->errorOut(e, "ScreenSeqsCommand", "ScreenSeqsCommand");
89 //***************************************************************************************************************
91 ScreenSeqsCommand::ScreenSeqsCommand(string option) {
93 abort = false; calledHelp = false;
95 //allow user to run help
96 if(option == "help") { help(); abort = true; calledHelp = true; }
99 vector<string> myArray = setParameters();
101 OptionParser parser(option);
102 map<string,string> parameters = parser.getParameters();
104 ValidParameters validParameter("screen.seqs");
105 map<string,string>::iterator it;
107 //check to make sure all parameters are valid for command
108 for (it = parameters.begin(); it != parameters.end(); it++) {
109 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
112 //initialize outputTypes
113 vector<string> tempOutNames;
114 outputTypes["fasta"] = tempOutNames;
115 outputTypes["name"] = tempOutNames;
116 outputTypes["group"] = tempOutNames;
117 outputTypes["alignreport"] = tempOutNames;
118 outputTypes["accnos"] = tempOutNames;
119 outputTypes["qfile"] = tempOutNames;
121 //if the user changes the input directory command factory will send this info to us in the output parameter
122 string inputDir = validParameter.validFile(parameters, "inputdir", false);
123 if (inputDir == "not found"){ inputDir = ""; }
126 it = parameters.find("fasta");
127 //user has given a template file
128 if(it != parameters.end()){
129 path = m->hasPath(it->second);
130 //if the user has not given a path then, add inputdir. else leave path alone.
131 if (path == "") { parameters["fasta"] = inputDir + it->second; }
134 it = parameters.find("group");
135 //user has given a template file
136 if(it != parameters.end()){
137 path = m->hasPath(it->second);
138 //if the user has not given a path then, add inputdir. else leave path alone.
139 if (path == "") { parameters["group"] = inputDir + it->second; }
142 it = parameters.find("name");
143 //user has given a template file
144 if(it != parameters.end()){
145 path = m->hasPath(it->second);
146 //if the user has not given a path then, add inputdir. else leave path alone.
147 if (path == "") { parameters["name"] = inputDir + it->second; }
150 it = parameters.find("alignreport");
151 //user has given a template file
152 if(it != parameters.end()){
153 path = m->hasPath(it->second);
154 //if the user has not given a path then, add inputdir. else leave path alone.
155 if (path == "") { parameters["alignreport"] = inputDir + it->second; }
158 it = parameters.find("qfile");
159 //user has given a template file
160 if(it != parameters.end()){
161 path = m->hasPath(it->second);
162 //if the user has not given a path then, add inputdir. else leave path alone.
163 if (path == "") { parameters["qfile"] = inputDir + it->second; }
168 //check for required parameters
169 fastafile = validParameter.validFile(parameters, "fasta", true);
170 if (fastafile == "not found") {
171 fastafile = m->getFastaFile();
172 if (fastafile != "") { m->mothurOut("Using " + fastafile + " as input file for the fasta parameter."); m->mothurOutEndLine(); }
173 else { m->mothurOut("You have no current fastafile and the fasta parameter is required."); m->mothurOutEndLine(); abort = true; }
175 else if (fastafile == "not open") { abort = true; }
177 groupfile = validParameter.validFile(parameters, "group", true);
178 if (groupfile == "not open") { abort = true; }
179 else if (groupfile == "not found") { groupfile = ""; }
181 qualfile = validParameter.validFile(parameters, "qfile", true);
182 if (qualfile == "not open") { abort = true; }
183 else if (qualfile == "not found") { qualfile = ""; }
185 namefile = validParameter.validFile(parameters, "name", true);
186 if (namefile == "not open") { namefile = ""; abort = true; }
187 else if (namefile == "not found") { namefile = ""; }
189 alignreport = validParameter.validFile(parameters, "alignreport", true);
190 if (alignreport == "not open") { abort = true; }
191 else if (alignreport == "not found") { alignreport = ""; }
193 //if the user changes the output directory command factory will send this info to us in the output parameter
194 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){
196 outputDir += m->hasPath(fastafile); //if user entered a file with a path then preserve it
199 //check for optional parameter and set defaults
200 // ...at some point should added some additional type checking...
202 temp = validParameter.validFile(parameters, "start", false); if (temp == "not found") { temp = "-1"; }
203 convert(temp, startPos);
205 temp = validParameter.validFile(parameters, "end", false); if (temp == "not found") { temp = "-1"; }
206 convert(temp, endPos);
208 temp = validParameter.validFile(parameters, "maxambig", false); if (temp == "not found") { temp = "-1"; }
209 convert(temp, maxAmbig);
211 temp = validParameter.validFile(parameters, "maxhomop", false); if (temp == "not found") { temp = "-1"; }
212 convert(temp, maxHomoP);
214 temp = validParameter.validFile(parameters, "minlength", false); if (temp == "not found") { temp = "-1"; }
215 convert(temp, minLength);
217 temp = validParameter.validFile(parameters, "maxlength", false); if (temp == "not found") { temp = "-1"; }
218 convert(temp, maxLength);
220 temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); }
221 m->setProcessors(temp);
222 convert(temp, processors);
224 temp = validParameter.validFile(parameters, "optimize", false); //optimizing trumps the optimized values original value
225 if (temp == "not found"){ temp = "none"; }
226 m->splitAtDash(temp, optimize);
228 //check for invalid optimize options
229 set<string> validOptimizers;
230 validOptimizers.insert("none"); validOptimizers.insert("start"); validOptimizers.insert("end"); validOptimizers.insert("maxambig"); validOptimizers.insert("maxhomop"); validOptimizers.insert("minlength"); validOptimizers.insert("maxlength");
231 for (int i = 0; i < optimize.size(); i++) {
232 if (validOptimizers.count(optimize[i]) == 0) {
233 m->mothurOut(optimize[i] + " is not a valid optimizer. Valid options are start, end, maxambig, maxhomop, minlength and maxlength."); m->mothurOutEndLine();
234 optimize.erase(optimize.begin()+i);
239 if (optimize.size() == 1) { if (optimize[0] == "none") { optimize.clear(); } }
241 temp = validParameter.validFile(parameters, "criteria", false); if (temp == "not found"){ temp = "90"; }
242 convert(temp, criteria);
246 catch(exception& e) {
247 m->errorOut(e, "ScreenSeqsCommand", "ScreenSeqsCommand");
251 //***************************************************************************************************************
253 int ScreenSeqsCommand::execute(){
256 if (abort == true) { if (calledHelp) { return 0; } return 2; }
258 //if the user want to optimize we need to know the 90% mark
259 vector<unsigned long int> positions;
260 if (optimize.size() != 0) { //get summary is paralellized so we need to divideFile, no need to do this step twice so I moved it here
261 //use the namefile to optimize correctly
262 if (namefile != "") { nameMap = m->readNames(namefile); }
263 getSummary(positions);
266 positions = m->divideFile(fastafile, processors);
267 for (int i = 0; i < (positions.size()-1); i++) {
268 lines.push_back(new linePair(positions[i], positions[(i+1)]));
272 string goodSeqFile = outputDir + m->getRootName(m->getSimpleName(fastafile)) + "good" + m->getExtension(fastafile);
273 string badAccnosFile = outputDir + m->getRootName(m->getSimpleName(fastafile)) + "bad.accnos";
275 int numFastaSeqs = 0;
276 set<string> badSeqNames;
277 int start = time(NULL);
280 int pid, numSeqsPerProcessor;
282 vector<unsigned long int> MPIPos;
285 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
286 MPI_Comm_size(MPI_COMM_WORLD, &processors);
290 MPI_File outMPIBadAccnos;
292 int outMode=MPI_MODE_CREATE|MPI_MODE_WRONLY;
293 int inMode=MPI_MODE_RDONLY;
295 char outGoodFilename[1024];
296 strcpy(outGoodFilename, goodSeqFile.c_str());
298 char outBadAccnosFilename[1024];
299 strcpy(outBadAccnosFilename, badAccnosFile.c_str());
301 char inFileName[1024];
302 strcpy(inFileName, fastafile.c_str());
304 MPI_File_open(MPI_COMM_WORLD, inFileName, inMode, MPI_INFO_NULL, &inMPI); //comm, filename, mode, info, filepointer
305 MPI_File_open(MPI_COMM_WORLD, outGoodFilename, outMode, MPI_INFO_NULL, &outMPIGood);
306 MPI_File_open(MPI_COMM_WORLD, outBadAccnosFilename, outMode, MPI_INFO_NULL, &outMPIBadAccnos);
308 if (m->control_pressed) { MPI_File_close(&inMPI); MPI_File_close(&outMPIGood); MPI_File_close(&outMPIBadAccnos); return 0; }
310 if (pid == 0) { //you are the root process
312 MPIPos = m->setFilePosFasta(fastafile, numFastaSeqs); //fills MPIPos, returns numSeqs
314 //send file positions to all processes
315 for(int i = 1; i < processors; i++) {
316 MPI_Send(&numFastaSeqs, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
317 MPI_Send(&MPIPos[0], (numFastaSeqs+1), MPI_LONG, i, tag, MPI_COMM_WORLD);
320 //figure out how many sequences you have to align
321 numSeqsPerProcessor = numFastaSeqs / processors;
322 int startIndex = pid * numSeqsPerProcessor;
323 if(pid == (processors - 1)){ numSeqsPerProcessor = numFastaSeqs - pid * numSeqsPerProcessor; }
324 // cout << pid << '\t' << numSeqsPerProcessor << '\t' << startIndex << endl;
326 driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPIGood, outMPIBadAccnos, MPIPos, badSeqNames);
327 //cout << pid << " done" << endl;
328 if (m->control_pressed) { MPI_File_close(&inMPI); MPI_File_close(&outMPIGood); MPI_File_close(&outMPIBadAccnos); return 0; }
330 for (int i = 1; i < processors; i++) {
334 MPI_Recv(&badSize, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
335 /*for (int j = 0; j < badSize; j++) {
337 MPI_Recv(&length, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status); //recv the length of the name
338 char* buf2 = new char[length]; //make space to recieve it
339 MPI_Recv(buf2, length, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status); //get name
341 string tempBuf = buf2;
342 if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length); }
345 badSeqNames.insert(tempBuf);
348 }else{ //you are a child process
349 MPI_Recv(&numFastaSeqs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
350 MPIPos.resize(numFastaSeqs+1);
351 MPI_Recv(&MPIPos[0], (numFastaSeqs+1), MPI_LONG, 0, tag, MPI_COMM_WORLD, &status);
353 //figure out how many sequences you have to align
354 numSeqsPerProcessor = numFastaSeqs / processors;
355 int startIndex = pid * numSeqsPerProcessor;
356 if(pid == (processors - 1)){ numSeqsPerProcessor = numFastaSeqs - pid * numSeqsPerProcessor; }
357 //cout << pid << '\t' << numSeqsPerProcessor << '\t' << startIndex << endl;
359 driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPIGood, outMPIBadAccnos, MPIPos, badSeqNames);
360 //cout << pid << " done" << endl;
361 if (m->control_pressed) { MPI_File_close(&inMPI); MPI_File_close(&outMPIGood); MPI_File_close(&outMPIBadAccnos); return 0; }
364 int badSize = badSeqNames.size();
365 MPI_Send(&badSize, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
368 set<string>::iterator it;
369 for (it = badSeqNames.begin(); it != badSeqNames.end(); it++) {
371 int length = name.length();
372 char* buf2 = new char[length];
373 memcpy(buf2, name.c_str(), length);
375 MPI_Send(&length, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
376 MPI_Send(buf2, length, MPI_CHAR, 0, tag, MPI_COMM_WORLD);
381 MPI_File_close(&inMPI);
382 MPI_File_close(&outMPIGood);
383 MPI_File_close(&outMPIBadAccnos);
384 MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case
388 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
390 numFastaSeqs = driver(lines[0], goodSeqFile, badAccnosFile, fastafile, badSeqNames);
392 if (m->control_pressed) { remove(goodSeqFile.c_str()); return 0; }
395 processIDS.resize(0);
397 numFastaSeqs = createProcesses(goodSeqFile, badAccnosFile, fastafile, badSeqNames);
399 rename((goodSeqFile + toString(processIDS[0]) + ".temp").c_str(), goodSeqFile.c_str());
400 rename((badAccnosFile + toString(processIDS[0]) + ".temp").c_str(), badAccnosFile.c_str());
402 //append alignment and report files
403 for(int i=1;i<processors;i++){
404 m->appendFiles((goodSeqFile + toString(processIDS[i]) + ".temp"), goodSeqFile);
405 remove((goodSeqFile + toString(processIDS[i]) + ".temp").c_str());
407 m->appendFiles((badAccnosFile + toString(processIDS[i]) + ".temp"), badAccnosFile);
408 remove((badAccnosFile + toString(processIDS[i]) + ".temp").c_str());
411 if (m->control_pressed) { remove(goodSeqFile.c_str()); return 0; }
413 //read badSeqs in because root process doesnt know what other "bad" seqs the children found
415 int ableToOpen = m->openInputFile(badAccnosFile, inBad, "no error");
417 if (ableToOpen == 0) {
420 while (!inBad.eof()) {
421 inBad >> tempName; m->gobble(inBad);
422 badSeqNames.insert(tempName);
428 numFastaSeqs = driver(lines[0], goodSeqFile, badAccnosFile, fastafile, badSeqNames);
430 if (m->control_pressed) { remove(goodSeqFile.c_str()); return 0; }
437 MPI_Comm_rank(MPI_COMM_WORLD, &pid);
439 if (pid == 0) { //only one process should fix files
441 //read accnos file with all names in it, process 0 just has its names
442 MPI_File inMPIAccnos;
445 char inFileName[1024];
446 strcpy(inFileName, badAccnosFile.c_str());
448 MPI_File_open(MPI_COMM_SELF, inFileName, inMode, MPI_INFO_NULL, &inMPIAccnos); //comm, filename, mode, info, filepointer
449 MPI_File_get_size(inMPIAccnos, &size);
451 char* buffer = new char[size];
452 MPI_File_read(inMPIAccnos, buffer, size, MPI_CHAR, &status);
454 string tempBuf = buffer;
455 if (tempBuf.length() > size) { tempBuf = tempBuf.substr(0, size); }
456 istringstream iss (tempBuf,istringstream::in);
459 MPI_File_close(&inMPIAccnos);
464 iss >> tempName; m->gobble(iss);
465 badSeqNames.insert(tempName);
469 if(namefile != "" && groupfile != "") {
470 screenNameGroupFile(badSeqNames);
471 if (m->control_pressed) { remove(goodSeqFile.c_str()); return 0; }
472 }else if(namefile != "") {
473 screenNameGroupFile(badSeqNames);
474 if (m->control_pressed) { remove(goodSeqFile.c_str()); return 0; }
475 }else if(groupfile != "") { screenGroupFile(badSeqNames); } // this screens just the group
477 if (m->control_pressed) { remove(goodSeqFile.c_str()); return 0; }
479 if(alignreport != "") { screenAlignReport(badSeqNames); }
480 if(qualfile != "") { screenQual(badSeqNames); }
482 if (m->control_pressed) { remove(goodSeqFile.c_str()); return 0; }
488 m->mothurOutEndLine();
489 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
490 m->mothurOut(goodSeqFile); m->mothurOutEndLine(); outputTypes["fasta"].push_back(goodSeqFile);
491 m->mothurOut(badAccnosFile); m->mothurOutEndLine(); outputTypes["accnos"].push_back(badAccnosFile);
492 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
493 m->mothurOutEndLine();
494 m->mothurOutEndLine();
496 //set fasta file as new current fastafile
498 itTypes = outputTypes.find("fasta");
499 if (itTypes != outputTypes.end()) {
500 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFastaFile(current); }
503 itTypes = outputTypes.find("name");
504 if (itTypes != outputTypes.end()) {
505 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setNameFile(current); }
508 itTypes = outputTypes.find("group");
509 if (itTypes != outputTypes.end()) {
510 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setGroupFile(current); }
513 itTypes = outputTypes.find("qfile");
514 if (itTypes != outputTypes.end()) {
515 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setQualFile(current); }
518 m->mothurOut("It took " + toString(time(NULL) - start) + " secs to screen " + toString(numFastaSeqs) + " sequences.");
519 m->mothurOutEndLine();
523 catch(exception& e) {
524 m->errorOut(e, "ScreenSeqsCommand", "execute");
529 //***************************************************************************************************************
531 int ScreenSeqsCommand::screenNameGroupFile(set<string> badSeqNames){
534 m->openInputFile(namefile, inputNames);
535 set<string> badSeqGroups;
536 string seqName, seqList, group;
537 set<string>::iterator it;
539 string goodNameFile = outputDir + m->getRootName(m->getSimpleName(namefile)) + "good" + m->getExtension(namefile);
540 outputNames.push_back(goodNameFile); outputTypes["name"].push_back(goodNameFile);
542 ofstream goodNameOut; m->openOutputFile(goodNameFile, goodNameOut);
544 while(!inputNames.eof()){
545 if (m->control_pressed) { goodNameOut.close(); inputNames.close(); remove(goodNameFile.c_str()); return 0; }
547 inputNames >> seqName >> seqList;
548 it = badSeqNames.find(seqName);
550 if(it != badSeqNames.end()){
551 badSeqNames.erase(it);
555 for(int i=0;i<seqList.length();i++){
556 if(seqList[i] == ','){
557 badSeqGroups.insert(seqList.substr(start,i-start));
561 badSeqGroups.insert(seqList.substr(start,seqList.length()-start));
565 goodNameOut << seqName << '\t' << seqList << endl;
567 m->gobble(inputNames);
572 //we were unable to remove some of the bad sequences
573 if (badSeqNames.size() != 0) {
574 for (it = badSeqNames.begin(); it != badSeqNames.end(); it++) {
575 m->mothurOut("Your namefile does not include the sequence " + *it + " please correct.");
576 m->mothurOutEndLine();
582 ifstream inputGroups;
583 m->openInputFile(groupfile, inputGroups);
585 string goodGroupFile = outputDir + m->getRootName(m->getSimpleName(groupfile)) + "good" + m->getExtension(groupfile);
586 outputNames.push_back(goodGroupFile); outputTypes["group"].push_back(goodGroupFile);
588 ofstream goodGroupOut; m->openOutputFile(goodGroupFile, goodGroupOut);
590 while(!inputGroups.eof()){
591 if (m->control_pressed) { goodGroupOut.close(); inputGroups.close(); remove(goodNameFile.c_str()); remove(goodGroupFile.c_str()); return 0; }
593 inputGroups >> seqName >> group;
595 it = badSeqGroups.find(seqName);
597 if(it != badSeqGroups.end()){
598 badSeqGroups.erase(it);
601 goodGroupOut << seqName << '\t' << group << endl;
603 m->gobble(inputGroups);
606 goodGroupOut.close();
608 //we were unable to remove some of the bad sequences
609 if (badSeqGroups.size() != 0) {
610 for (it = badSeqGroups.begin(); it != badSeqGroups.end(); it++) {
611 m->mothurOut("Your groupfile does not include the sequence " + *it + " please correct.");
612 m->mothurOutEndLine();
621 catch(exception& e) {
622 m->errorOut(e, "ScreenSeqsCommand", "screenNameGroupFile");
626 //***************************************************************************************************************
627 int ScreenSeqsCommand::getSummary(vector<unsigned long int>& positions){
630 vector<int> startPosition;
631 vector<int> endPosition;
632 vector<int> seqLength;
633 vector<int> ambigBases;
634 vector<int> longHomoPolymer;
636 vector<unsigned long int> positions = m->divideFile(fastafile, processors);
638 for (int i = 0; i < (positions.size()-1); i++) {
639 lines.push_back(new linePair(positions[i], positions[(i+1)]));
643 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
645 numSeqs = driverCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, fastafile, lines[0]);
647 numSeqs = createProcessesCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, fastafile);
650 if (m->control_pressed) { return 0; }
652 numSeqs = driverCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, fastafile, lines[0]);
653 if (m->control_pressed) { return 0; }
656 sort(startPosition.begin(), startPosition.end());
657 sort(endPosition.begin(), endPosition.end());
658 sort(seqLength.begin(), seqLength.end());
659 sort(ambigBases.begin(), ambigBases.end());
660 sort(longHomoPolymer.begin(), longHomoPolymer.end());
662 //numSeqs is the number of unique seqs, startPosition.size() is the total number of seqs, we want to optimize using all seqs
663 int criteriaPercentile = int(startPosition.size() * (criteria / (float) 100));
665 for (int i = 0; i < optimize.size(); i++) {
666 if (optimize[i] == "start") { startPos = startPosition[criteriaPercentile]; m->mothurOut("Optimizing start to " + toString(startPos) + "."); m->mothurOutEndLine(); }
667 else if (optimize[i] == "end") { int endcriteriaPercentile = int(endPosition.size() * ((100 - criteria) / (float) 100)); endPos = endPosition[endcriteriaPercentile]; m->mothurOut("Optimizing end to " + toString(endPos) + "."); m->mothurOutEndLine();}
668 else if (optimize[i] == "maxambig") { maxAmbig = ambigBases[criteriaPercentile]; m->mothurOut("Optimizing maxambig to " + toString(maxAmbig) + "."); m->mothurOutEndLine(); }
669 else if (optimize[i] == "maxhomop") { maxHomoP = longHomoPolymer[criteriaPercentile]; m->mothurOut("Optimizing maxhomop to " + toString(maxHomoP) + "."); m->mothurOutEndLine(); }
670 else if (optimize[i] == "minlength") { int mincriteriaPercentile = int(seqLength.size() * ((100 - criteria) / (float) 100)); minLength = seqLength[mincriteriaPercentile]; m->mothurOut("Optimizing minlength to " + toString(minLength) + "."); m->mothurOutEndLine(); }
671 else if (optimize[i] == "maxlength") { maxLength = seqLength[criteriaPercentile]; m->mothurOut("Optimizing maxlength to " + toString(maxLength) + "."); m->mothurOutEndLine(); }
676 catch(exception& e) {
677 m->errorOut(e, "ScreenSeqsCommand", "getSummary");
681 /**************************************************************************************/
682 int ScreenSeqsCommand::driverCreateSummary(vector<int>& startPosition, vector<int>& endPosition, vector<int>& seqLength, vector<int>& ambigBases, vector<int>& longHomoPolymer, string filename, linePair* filePos) {
686 m->openInputFile(filename, in);
688 in.seekg(filePos->start);
695 if (m->control_pressed) { in.close(); return 1; }
697 Sequence current(in); m->gobble(in);
699 if (current.getName() != "") {
701 if (namefile != "") {
702 //make sure this sequence is in the namefile, else error
703 map<string, int>::iterator it = nameMap.find(current.getName());
705 if (it == nameMap.end()) { m->mothurOut("[ERROR]: " + current.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); m->control_pressed = true; }
706 else { num = it->second; }
709 //for each sequence this sequence represents
710 for (int i = 0; i < num; i++) {
711 startPosition.push_back(current.getStartPos());
712 endPosition.push_back(current.getEndPos());
713 seqLength.push_back(current.getNumBases());
714 ambigBases.push_back(current.getAmbigBases());
715 longHomoPolymer.push_back(current.getLongHomoPolymer());
721 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
722 unsigned long int pos = in.tellg();
723 if ((pos == -1) || (pos >= filePos->end)) { break; }
725 if (in.eof()) { break; }
734 catch(exception& e) {
735 m->errorOut(e, "ScreenSeqsCommand", "driverCreateSummary");
739 /**************************************************************************************************/
740 int ScreenSeqsCommand::createProcessesCreateSummary(vector<int>& startPosition, vector<int>& endPosition, vector<int>& seqLength, vector<int>& ambigBases, vector<int>& longHomoPolymer, string filename) {
742 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
747 //loop through and create all the processes you want
748 while (process != processors) {
752 processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later
755 num = driverCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, fastafile, lines[process]);
757 //pass numSeqs to parent
759 string tempFile = fastafile + toString(getpid()) + ".num.temp";
760 m->openOutputFile(tempFile, out);
763 out << startPosition.size() << endl;
764 for (int k = 0; k < startPosition.size(); k++) { out << startPosition[k] << '\t'; } out << endl;
765 for (int k = 0; k < endPosition.size(); k++) { out << endPosition[k] << '\t'; } out << endl;
766 for (int k = 0; k < seqLength.size(); k++) { out << seqLength[k] << '\t'; } out << endl;
767 for (int k = 0; k < ambigBases.size(); k++) { out << ambigBases[k] << '\t'; } out << endl;
768 for (int k = 0; k < longHomoPolymer.size(); k++) { out << longHomoPolymer[k] << '\t'; } out << endl;
774 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine();
775 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
780 num = driverCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, fastafile, lines[0]);
782 //force parent to wait until all the processes are done
783 for (int i=0;i<processIDS.size();i++) {
784 int temp = processIDS[i];
788 //parent reads in and combine Filter info
789 for (int i = 0; i < processIDS.size(); i++) {
790 string tempFilename = fastafile + toString(processIDS[i]) + ".num.temp";
792 m->openInputFile(tempFilename, in);
795 in >> tempNum; m->gobble(in); num += tempNum;
796 in >> tempNum; m->gobble(in);
797 for (int k = 0; k < tempNum; k++) { in >> temp; startPosition.push_back(temp); } m->gobble(in);
798 for (int k = 0; k < tempNum; k++) { in >> temp; endPosition.push_back(temp); } m->gobble(in);
799 for (int k = 0; k < tempNum; k++) { in >> temp; seqLength.push_back(temp); } m->gobble(in);
800 for (int k = 0; k < tempNum; k++) { in >> temp; ambigBases.push_back(temp); } m->gobble(in);
801 for (int k = 0; k < tempNum; k++) { in >> temp; longHomoPolymer.push_back(temp); } m->gobble(in);
804 remove(tempFilename.c_str());
810 catch(exception& e) {
811 m->errorOut(e, "ScreenSeqsCommand", "createProcessesCreateSummary");
816 //***************************************************************************************************************
818 int ScreenSeqsCommand::screenGroupFile(set<string> badSeqNames){
820 ifstream inputGroups;
821 m->openInputFile(groupfile, inputGroups);
822 string seqName, group;
823 set<string>::iterator it;
825 string goodGroupFile = outputDir + m->getRootName(m->getSimpleName(groupfile)) + "good" + m->getExtension(groupfile);
826 outputNames.push_back(goodGroupFile); outputTypes["group"].push_back(goodGroupFile);
827 ofstream goodGroupOut; m->openOutputFile(goodGroupFile, goodGroupOut);
829 while(!inputGroups.eof()){
830 if (m->control_pressed) { goodGroupOut.close(); inputGroups.close(); remove(goodGroupFile.c_str()); return 0; }
832 inputGroups >> seqName >> group;
833 it = badSeqNames.find(seqName);
835 if(it != badSeqNames.end()){
836 badSeqNames.erase(it);
839 goodGroupOut << seqName << '\t' << group << endl;
841 m->gobble(inputGroups);
844 if (m->control_pressed) { goodGroupOut.close(); inputGroups.close(); remove(goodGroupFile.c_str()); return 0; }
846 //we were unable to remove some of the bad sequences
847 if (badSeqNames.size() != 0) {
848 for (it = badSeqNames.begin(); it != badSeqNames.end(); it++) {
849 m->mothurOut("Your groupfile does not include the sequence " + *it + " please correct.");
850 m->mothurOutEndLine();
855 goodGroupOut.close();
857 if (m->control_pressed) { remove(goodGroupFile.c_str()); }
862 catch(exception& e) {
863 m->errorOut(e, "ScreenSeqsCommand", "screenGroupFile");
868 //***************************************************************************************************************
870 int ScreenSeqsCommand::screenAlignReport(set<string> badSeqNames){
872 ifstream inputAlignReport;
873 m->openInputFile(alignreport, inputAlignReport);
874 string seqName, group;
875 set<string>::iterator it;
877 string goodAlignReportFile = outputDir + m->getRootName(m->getSimpleName(alignreport)) + "good" + m->getExtension(alignreport);
878 outputNames.push_back(goodAlignReportFile); outputTypes["alignreport"].push_back(goodAlignReportFile);
879 ofstream goodAlignReportOut; m->openOutputFile(goodAlignReportFile, goodAlignReportOut);
881 while (!inputAlignReport.eof()) { // need to copy header
882 char c = inputAlignReport.get();
883 goodAlignReportOut << c;
884 if (c == 10 || c == 13){ break; }
887 while(!inputAlignReport.eof()){
888 if (m->control_pressed) { goodAlignReportOut.close(); inputAlignReport.close(); remove(goodAlignReportFile.c_str()); return 0; }
890 inputAlignReport >> seqName;
891 it = badSeqNames.find(seqName);
893 while (!inputAlignReport.eof()) { // need to copy header
894 char c = inputAlignReport.get();
896 if (c == 10 || c == 13){ break; }
899 if(it != badSeqNames.end()){
900 badSeqNames.erase(it);
903 goodAlignReportOut << seqName << '\t' << line;
905 m->gobble(inputAlignReport);
908 if (m->control_pressed) { goodAlignReportOut.close(); inputAlignReport.close(); remove(goodAlignReportFile.c_str()); return 0; }
910 //we were unable to remove some of the bad sequences
911 if (badSeqNames.size() != 0) {
912 for (it = badSeqNames.begin(); it != badSeqNames.end(); it++) {
913 m->mothurOut("Your alignreport file does not include the sequence " + *it + " please correct.");
914 m->mothurOutEndLine();
918 inputAlignReport.close();
919 goodAlignReportOut.close();
921 if (m->control_pressed) { remove(goodAlignReportFile.c_str()); return 0; }
926 catch(exception& e) {
927 m->errorOut(e, "ScreenSeqsCommand", "screenAlignReport");
932 //***************************************************************************************************************
934 int ScreenSeqsCommand::screenQual(set<string> badSeqNames){
937 m->openInputFile(qualfile, in);
938 set<string>::iterator it;
940 string goodQualFile = outputDir + m->getRootName(m->getSimpleName(qualfile)) + "good" + m->getExtension(qualfile);
941 outputNames.push_back(goodQualFile); outputTypes["qfile"].push_back(goodQualFile);
942 ofstream goodQual; m->openOutputFile(goodQualFile, goodQual);
946 if (m->control_pressed) { goodQual.close(); in.close(); remove(goodQualFile.c_str()); return 0; }
948 string saveName = "";
954 if (name.length() != 0) {
955 saveName = name.substr(1);
958 if (c == 10 || c == 13){ break; }
965 char letter= in.get();
966 if(letter == '>'){ in.putback(letter); break; }
967 else{ scores += letter; }
972 it = badSeqNames.find(saveName);
974 if(it != badSeqNames.end()){
975 badSeqNames.erase(it);
977 goodQual << name << endl << scores;
986 //we were unable to remove some of the bad sequences
987 if (badSeqNames.size() != 0) {
988 for (it = badSeqNames.begin(); it != badSeqNames.end(); it++) {
989 m->mothurOut("Your qual file does not include the sequence " + *it + " please correct.");
990 m->mothurOutEndLine();
994 if (m->control_pressed) { remove(goodQualFile.c_str()); return 0; }
999 catch(exception& e) {
1000 m->errorOut(e, "ScreenSeqsCommand", "screenQual");
1005 //**********************************************************************************************************************
1007 int ScreenSeqsCommand::driver(linePair* filePos, string goodFName, string badAccnosFName, string filename, set<string>& badSeqNames){
1010 m->openOutputFile(goodFName, goodFile);
1012 ofstream badAccnosFile;
1013 m->openOutputFile(badAccnosFName, badAccnosFile);
1016 m->openInputFile(filename, inFASTA);
1018 inFASTA.seekg(filePos->start);
1025 if (m->control_pressed) { return 0; }
1027 Sequence currSeq(inFASTA); m->gobble(inFASTA);
1028 if (currSeq.getName() != "") {
1029 bool goodSeq = 1; // innocent until proven guilty
1030 if(goodSeq == 1 && startPos != -1 && startPos < currSeq.getStartPos()) { goodSeq = 0; }
1031 if(goodSeq == 1 && endPos != -1 && endPos > currSeq.getEndPos()) { goodSeq = 0; }
1032 if(goodSeq == 1 && maxAmbig != -1 && maxAmbig < currSeq.getAmbigBases()) { goodSeq = 0; }
1033 if(goodSeq == 1 && maxHomoP != -1 && maxHomoP < currSeq.getLongHomoPolymer()) { goodSeq = 0; }
1034 if(goodSeq == 1 && minLength != -1 && minLength > currSeq.getNumBases()) { goodSeq = 0; }
1035 if(goodSeq == 1 && maxLength != -1 && maxLength < currSeq.getNumBases()) { goodSeq = 0; }
1038 currSeq.printSequence(goodFile);
1041 badAccnosFile << currSeq.getName() << endl;
1042 badSeqNames.insert(currSeq.getName());
1047 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
1048 unsigned long int pos = inFASTA.tellg();
1049 if ((pos == -1) || (pos >= filePos->end)) { break; }
1051 if (inFASTA.eof()) { break; }
1055 if((count) % 100 == 0){ m->mothurOut("Processing sequence: " + toString(count)); m->mothurOutEndLine(); }
1058 if((count) % 100 != 0){ m->mothurOut("Processing sequence: " + toString(count)); m->mothurOutEndLine(); }
1063 badAccnosFile.close();
1067 catch(exception& e) {
1068 m->errorOut(e, "ScreenSeqsCommand", "driver");
1072 //**********************************************************************************************************************
1074 int ScreenSeqsCommand::driverMPI(int start, int num, MPI_File& inMPI, MPI_File& goodFile, MPI_File& badAccnosFile, vector<unsigned long int>& MPIPos, set<string>& badSeqNames){
1076 string outputString = "";
1077 MPI_Status statusGood;
1078 MPI_Status statusBadAccnos;
1081 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
1083 for(int i=0;i<num;i++){
1085 if (m->control_pressed) { return 0; }
1087 //read next sequence
1088 int length = MPIPos[start+i+1] - MPIPos[start+i];
1090 char* buf4 = new char[length];
1091 memcpy(buf4, outputString.c_str(), length);
1093 MPI_File_read_at(inMPI, MPIPos[start+i], buf4, length, MPI_CHAR, &status);
1095 string tempBuf = buf4; delete buf4;
1096 if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length); }
1097 istringstream iss (tempBuf,istringstream::in);
1099 Sequence currSeq(iss);
1102 if (currSeq.getName() != "") {
1103 bool goodSeq = 1; // innocent until proven guilty
1104 if(goodSeq == 1 && startPos != -1 && startPos < currSeq.getStartPos()) { goodSeq = 0; }
1105 if(goodSeq == 1 && endPos != -1 && endPos > currSeq.getEndPos()) { goodSeq = 0; }
1106 if(goodSeq == 1 && maxAmbig != -1 && maxAmbig < currSeq.getAmbigBases()) { goodSeq = 0; }
1107 if(goodSeq == 1 && maxHomoP != -1 && maxHomoP < currSeq.getLongHomoPolymer()) { goodSeq = 0; }
1108 if(goodSeq == 1 && minLength != -1 && minLength > currSeq.getNumBases()) { goodSeq = 0; }
1109 if(goodSeq == 1 && maxLength != -1 && maxLength < currSeq.getNumBases()) { goodSeq = 0; }
1112 outputString = ">" + currSeq.getName() + "\n" + currSeq.getAligned() + "\n";
1115 length = outputString.length();
1116 char* buf2 = new char[length];
1117 memcpy(buf2, outputString.c_str(), length);
1119 MPI_File_write_shared(goodFile, buf2, length, MPI_CHAR, &statusGood);
1124 badSeqNames.insert(currSeq.getName());
1126 //write to bad accnos file
1127 outputString = currSeq.getName() + "\n";
1129 length = outputString.length();
1130 char* buf3 = new char[length];
1131 memcpy(buf3, outputString.c_str(), length);
1133 MPI_File_write_shared(badAccnosFile, buf3, length, MPI_CHAR, &statusBadAccnos);
1139 if((i) % 100 == 0){ m->mothurOut("Processing sequence: " + toString(i)); m->mothurOutEndLine(); }
1144 catch(exception& e) {
1145 m->errorOut(e, "ScreenSeqsCommand", "driverMPI");
1150 /**************************************************************************************************/
1152 int ScreenSeqsCommand::createProcesses(string goodFileName, string badAccnos, string filename, set<string>& badSeqNames) {
1154 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
1158 //loop through and create all the processes you want
1159 while (process != processors) {
1163 processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later
1165 }else if (pid == 0){
1166 num = driver(lines[process], goodFileName + toString(getpid()) + ".temp", badAccnos + toString(getpid()) + ".temp", filename, badSeqNames);
1168 //pass numSeqs to parent
1170 string tempFile = filename + toString(getpid()) + ".num.temp";
1171 m->openOutputFile(tempFile, out);
1177 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine();
1178 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
1183 //force parent to wait until all the processes are done
1184 for (int i=0;i<processors;i++) {
1185 int temp = processIDS[i];
1189 for (int i = 0; i < processIDS.size(); i++) {
1191 string tempFile = filename + toString(processIDS[i]) + ".num.temp";
1192 m->openInputFile(tempFile, in);
1193 if (!in.eof()) { int tempNum = 0; in >> tempNum; num += tempNum; }
1194 in.close(); remove(tempFile.c_str());
1200 catch(exception& e) {
1201 m->errorOut(e, "ScreenSeqsCommand", "createProcesses");
1206 //***************************************************************************************************************