2 * screenseqscommand.cpp
5 * Created by Pat Schloss on 6/3/09.
6 * Copyright 2009 Patrick D. Schloss. All rights reserved.
10 #include "screenseqscommand.h"
11 #include "sequence.hpp"
13 //**********************************************************************************************************************
14 vector<string> ScreenSeqsCommand::setParameters(){
16 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta);
17 CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname);
18 CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pgroup);
19 CommandParameter pqfile("qfile", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pqfile);
20 CommandParameter palignreport("alignreport", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(palignreport);
21 CommandParameter pstart("start", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pstart);
22 CommandParameter pend("end", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pend);
23 CommandParameter pmaxambig("maxambig", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pmaxambig);
24 CommandParameter pmaxhomop("maxhomop", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pmaxhomop);
25 CommandParameter pminlength("minlength", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pminlength);
26 CommandParameter pmaxlength("maxlength", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pmaxlength);
27 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
28 CommandParameter pcriteria("criteria", "Number", "", "90", "", "", "",false,false); parameters.push_back(pcriteria);
29 CommandParameter poptimize("optimize", "Multiple", "none-start-end-maxambig-maxhomop-minlength-maxlength", "none", "", "", "",true,false); parameters.push_back(poptimize);
30 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
31 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
33 vector<string> myArray;
34 for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); }
38 m->errorOut(e, "ScreenSeqsCommand", "setParameters");
42 //**********************************************************************************************************************
43 string ScreenSeqsCommand::getHelpString(){
45 string helpString = "";
46 helpString += "The screen.seqs command reads a fastafile and creates .....\n";
47 helpString += "The screen.seqs command parameters are fasta, start, end, maxambig, maxhomop, minlength, maxlength, name, group, qfile, optimize, criteria and processors.\n";
48 helpString += "The fasta parameter is required.\n";
49 helpString += "The start parameter .... The default is -1.\n";
50 helpString += "The end parameter .... The default is -1.\n";
51 helpString += "The maxambig parameter allows you to set the maximum number of ambigious bases allowed. The default is -1.\n";
52 helpString += "The maxhomop parameter allows you to set a maximum homopolymer length. \n";
53 helpString += "The minlength parameter allows you to set and minimum sequence length. \n";
54 helpString += "The maxlength parameter allows you to set and maximum sequence length. \n";
55 helpString += "The processors parameter allows you to specify the number of processors to use while running the command. The default is 1.\n";
56 helpString += "The optimize and criteria parameters allow you set the start, end, maxabig, maxhomop, minlength and maxlength parameters relative to your set of sequences .\n";
57 helpString += "For example optimize=start-end, criteria=90, would set the start and end values to the position 90% of your sequences started and ended.\n";
58 helpString += "The name parameter allows you to provide a namesfile, and the group parameter allows you to provide a groupfile.\n";
59 helpString += "The screen.seqs command should be in the following format: \n";
60 helpString += "screen.seqs(fasta=yourFastaFile, name=youNameFile, group=yourGroupFIle, start=yourStart, end=yourEnd, maxambig=yourMaxambig, \n";
61 helpString += "maxhomop=yourMaxhomop, minlength=youMinlength, maxlength=yourMaxlength) \n";
62 helpString += "Example screen.seqs(fasta=abrecovery.fasta, name=abrecovery.names, group=abrecovery.groups, start=..., end=..., maxambig=..., maxhomop=..., minlength=..., maxlength=...).\n";
63 helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n";
67 m->errorOut(e, "ScreenSeqsCommand", "getHelpString");
71 //**********************************************************************************************************************
72 ScreenSeqsCommand::ScreenSeqsCommand(){
74 abort = true; calledHelp = true;
76 vector<string> tempOutNames;
77 outputTypes["fasta"] = tempOutNames;
78 outputTypes["name"] = tempOutNames;
79 outputTypes["group"] = tempOutNames;
80 outputTypes["alignreport"] = tempOutNames;
81 outputTypes["accnos"] = tempOutNames;
82 outputTypes["qfile"] = tempOutNames;
85 m->errorOut(e, "ScreenSeqsCommand", "ScreenSeqsCommand");
89 //***************************************************************************************************************
91 ScreenSeqsCommand::ScreenSeqsCommand(string option) {
93 abort = false; calledHelp = false;
95 //allow user to run help
96 if(option == "help") { help(); abort = true; calledHelp = true; }
97 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
100 vector<string> myArray = setParameters();
102 OptionParser parser(option);
103 map<string,string> parameters = parser.getParameters();
105 ValidParameters validParameter("screen.seqs");
106 map<string,string>::iterator it;
108 //check to make sure all parameters are valid for command
109 for (it = parameters.begin(); it != parameters.end(); it++) {
110 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
113 //initialize outputTypes
114 vector<string> tempOutNames;
115 outputTypes["fasta"] = tempOutNames;
116 outputTypes["name"] = tempOutNames;
117 outputTypes["group"] = tempOutNames;
118 outputTypes["alignreport"] = tempOutNames;
119 outputTypes["accnos"] = tempOutNames;
120 outputTypes["qfile"] = tempOutNames;
122 //if the user changes the input directory command factory will send this info to us in the output parameter
123 string inputDir = validParameter.validFile(parameters, "inputdir", false);
124 if (inputDir == "not found"){ inputDir = ""; }
127 it = parameters.find("fasta");
128 //user has given a template file
129 if(it != parameters.end()){
130 path = m->hasPath(it->second);
131 //if the user has not given a path then, add inputdir. else leave path alone.
132 if (path == "") { parameters["fasta"] = inputDir + it->second; }
135 it = parameters.find("group");
136 //user has given a template file
137 if(it != parameters.end()){
138 path = m->hasPath(it->second);
139 //if the user has not given a path then, add inputdir. else leave path alone.
140 if (path == "") { parameters["group"] = inputDir + it->second; }
143 it = parameters.find("name");
144 //user has given a template file
145 if(it != parameters.end()){
146 path = m->hasPath(it->second);
147 //if the user has not given a path then, add inputdir. else leave path alone.
148 if (path == "") { parameters["name"] = inputDir + it->second; }
151 it = parameters.find("alignreport");
152 //user has given a template file
153 if(it != parameters.end()){
154 path = m->hasPath(it->second);
155 //if the user has not given a path then, add inputdir. else leave path alone.
156 if (path == "") { parameters["alignreport"] = inputDir + it->second; }
159 it = parameters.find("qfile");
160 //user has given a template file
161 if(it != parameters.end()){
162 path = m->hasPath(it->second);
163 //if the user has not given a path then, add inputdir. else leave path alone.
164 if (path == "") { parameters["qfile"] = inputDir + it->second; }
169 //check for required parameters
170 fastafile = validParameter.validFile(parameters, "fasta", true);
171 if (fastafile == "not found") {
172 fastafile = m->getFastaFile();
173 if (fastafile != "") { m->mothurOut("Using " + fastafile + " as input file for the fasta parameter."); m->mothurOutEndLine(); }
174 else { m->mothurOut("You have no current fastafile and the fasta parameter is required."); m->mothurOutEndLine(); abort = true; }
176 else if (fastafile == "not open") { abort = true; }
177 else { m->setFastaFile(fastafile); }
179 groupfile = validParameter.validFile(parameters, "group", true);
180 if (groupfile == "not open") { abort = true; }
181 else if (groupfile == "not found") { groupfile = ""; }
182 else { m->setGroupFile(groupfile); }
184 qualfile = validParameter.validFile(parameters, "qfile", true);
185 if (qualfile == "not open") { abort = true; }
186 else if (qualfile == "not found") { qualfile = ""; }
187 else { m->setQualFile(qualfile); }
189 namefile = validParameter.validFile(parameters, "name", true);
190 if (namefile == "not open") { namefile = ""; abort = true; }
191 else if (namefile == "not found") { namefile = ""; }
192 else { m->setNameFile(namefile); }
194 alignreport = validParameter.validFile(parameters, "alignreport", true);
195 if (alignreport == "not open") { abort = true; }
196 else if (alignreport == "not found") { alignreport = ""; }
198 //if the user changes the output directory command factory will send this info to us in the output parameter
199 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){
201 outputDir += m->hasPath(fastafile); //if user entered a file with a path then preserve it
204 //check for optional parameter and set defaults
205 // ...at some point should added some additional type checking...
207 temp = validParameter.validFile(parameters, "start", false); if (temp == "not found") { temp = "-1"; }
208 convert(temp, startPos);
210 temp = validParameter.validFile(parameters, "end", false); if (temp == "not found") { temp = "-1"; }
211 convert(temp, endPos);
213 temp = validParameter.validFile(parameters, "maxambig", false); if (temp == "not found") { temp = "-1"; }
214 convert(temp, maxAmbig);
216 temp = validParameter.validFile(parameters, "maxhomop", false); if (temp == "not found") { temp = "-1"; }
217 convert(temp, maxHomoP);
219 temp = validParameter.validFile(parameters, "minlength", false); if (temp == "not found") { temp = "-1"; }
220 convert(temp, minLength);
222 temp = validParameter.validFile(parameters, "maxlength", false); if (temp == "not found") { temp = "-1"; }
223 convert(temp, maxLength);
225 temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); }
226 m->setProcessors(temp);
227 convert(temp, processors);
229 temp = validParameter.validFile(parameters, "optimize", false); //optimizing trumps the optimized values original value
230 if (temp == "not found"){ temp = "none"; }
231 m->splitAtDash(temp, optimize);
233 //check for invalid optimize options
234 set<string> validOptimizers;
235 validOptimizers.insert("none"); validOptimizers.insert("start"); validOptimizers.insert("end"); validOptimizers.insert("maxambig"); validOptimizers.insert("maxhomop"); validOptimizers.insert("minlength"); validOptimizers.insert("maxlength");
236 for (int i = 0; i < optimize.size(); i++) {
237 if (validOptimizers.count(optimize[i]) == 0) {
238 m->mothurOut(optimize[i] + " is not a valid optimizer. Valid options are start, end, maxambig, maxhomop, minlength and maxlength."); m->mothurOutEndLine();
239 optimize.erase(optimize.begin()+i);
244 if (optimize.size() == 1) { if (optimize[0] == "none") { optimize.clear(); } }
246 temp = validParameter.validFile(parameters, "criteria", false); if (temp == "not found"){ temp = "90"; }
247 convert(temp, criteria);
251 catch(exception& e) {
252 m->errorOut(e, "ScreenSeqsCommand", "ScreenSeqsCommand");
256 //***************************************************************************************************************
258 int ScreenSeqsCommand::execute(){
261 if (abort == true) { if (calledHelp) { return 0; } return 2; }
263 //if the user want to optimize we need to know the 90% mark
264 vector<unsigned long int> positions;
265 if (optimize.size() != 0) { //get summary is paralellized so we need to divideFile, no need to do this step twice so I moved it here
266 //use the namefile to optimize correctly
267 if (namefile != "") { nameMap = m->readNames(namefile); }
268 getSummary(positions);
271 positions = m->divideFile(fastafile, processors);
272 for (int i = 0; i < (positions.size()-1); i++) {
273 lines.push_back(new linePair(positions[i], positions[(i+1)]));
277 string goodSeqFile = outputDir + m->getRootName(m->getSimpleName(fastafile)) + "good" + m->getExtension(fastafile);
278 string badAccnosFile = outputDir + m->getRootName(m->getSimpleName(fastafile)) + "bad.accnos";
280 int numFastaSeqs = 0;
281 set<string> badSeqNames;
282 int start = time(NULL);
285 int pid, numSeqsPerProcessor;
287 vector<unsigned long int> MPIPos;
290 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
291 MPI_Comm_size(MPI_COMM_WORLD, &processors);
295 MPI_File outMPIBadAccnos;
297 int outMode=MPI_MODE_CREATE|MPI_MODE_WRONLY;
298 int inMode=MPI_MODE_RDONLY;
300 char outGoodFilename[1024];
301 strcpy(outGoodFilename, goodSeqFile.c_str());
303 char outBadAccnosFilename[1024];
304 strcpy(outBadAccnosFilename, badAccnosFile.c_str());
306 char inFileName[1024];
307 strcpy(inFileName, fastafile.c_str());
309 MPI_File_open(MPI_COMM_WORLD, inFileName, inMode, MPI_INFO_NULL, &inMPI); //comm, filename, mode, info, filepointer
310 MPI_File_open(MPI_COMM_WORLD, outGoodFilename, outMode, MPI_INFO_NULL, &outMPIGood);
311 MPI_File_open(MPI_COMM_WORLD, outBadAccnosFilename, outMode, MPI_INFO_NULL, &outMPIBadAccnos);
313 if (m->control_pressed) { MPI_File_close(&inMPI); MPI_File_close(&outMPIGood); MPI_File_close(&outMPIBadAccnos); return 0; }
315 if (pid == 0) { //you are the root process
317 MPIPos = m->setFilePosFasta(fastafile, numFastaSeqs); //fills MPIPos, returns numSeqs
319 //send file positions to all processes
320 for(int i = 1; i < processors; i++) {
321 MPI_Send(&numFastaSeqs, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
322 MPI_Send(&MPIPos[0], (numFastaSeqs+1), MPI_LONG, i, tag, MPI_COMM_WORLD);
325 //figure out how many sequences you have to align
326 numSeqsPerProcessor = numFastaSeqs / processors;
327 int startIndex = pid * numSeqsPerProcessor;
328 if(pid == (processors - 1)){ numSeqsPerProcessor = numFastaSeqs - pid * numSeqsPerProcessor; }
329 // cout << pid << '\t' << numSeqsPerProcessor << '\t' << startIndex << endl;
331 driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPIGood, outMPIBadAccnos, MPIPos, badSeqNames);
332 //cout << pid << " done" << endl;
333 if (m->control_pressed) { MPI_File_close(&inMPI); MPI_File_close(&outMPIGood); MPI_File_close(&outMPIBadAccnos); return 0; }
335 for (int i = 1; i < processors; i++) {
339 MPI_Recv(&badSize, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
340 /*for (int j = 0; j < badSize; j++) {
342 MPI_Recv(&length, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status); //recv the length of the name
343 char* buf2 = new char[length]; //make space to recieve it
344 MPI_Recv(buf2, length, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status); //get name
346 string tempBuf = buf2;
347 if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length); }
350 badSeqNames.insert(tempBuf);
353 }else{ //you are a child process
354 MPI_Recv(&numFastaSeqs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
355 MPIPos.resize(numFastaSeqs+1);
356 MPI_Recv(&MPIPos[0], (numFastaSeqs+1), MPI_LONG, 0, tag, MPI_COMM_WORLD, &status);
358 //figure out how many sequences you have to align
359 numSeqsPerProcessor = numFastaSeqs / processors;
360 int startIndex = pid * numSeqsPerProcessor;
361 if(pid == (processors - 1)){ numSeqsPerProcessor = numFastaSeqs - pid * numSeqsPerProcessor; }
362 //cout << pid << '\t' << numSeqsPerProcessor << '\t' << startIndex << endl;
364 driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPIGood, outMPIBadAccnos, MPIPos, badSeqNames);
365 //cout << pid << " done" << endl;
366 if (m->control_pressed) { MPI_File_close(&inMPI); MPI_File_close(&outMPIGood); MPI_File_close(&outMPIBadAccnos); return 0; }
369 int badSize = badSeqNames.size();
370 MPI_Send(&badSize, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
373 set<string>::iterator it;
374 for (it = badSeqNames.begin(); it != badSeqNames.end(); it++) {
376 int length = name.length();
377 char* buf2 = new char[length];
378 memcpy(buf2, name.c_str(), length);
380 MPI_Send(&length, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
381 MPI_Send(buf2, length, MPI_CHAR, 0, tag, MPI_COMM_WORLD);
386 MPI_File_close(&inMPI);
387 MPI_File_close(&outMPIGood);
388 MPI_File_close(&outMPIBadAccnos);
389 MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case
393 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
395 numFastaSeqs = driver(lines[0], goodSeqFile, badAccnosFile, fastafile, badSeqNames);
397 if (m->control_pressed) { m->mothurRemove(goodSeqFile); return 0; }
400 processIDS.resize(0);
402 numFastaSeqs = createProcesses(goodSeqFile, badAccnosFile, fastafile, badSeqNames);
404 rename((goodSeqFile + toString(processIDS[0]) + ".temp").c_str(), goodSeqFile.c_str());
405 rename((badAccnosFile + toString(processIDS[0]) + ".temp").c_str(), badAccnosFile.c_str());
407 //append alignment and report files
408 for(int i=1;i<processors;i++){
409 m->appendFiles((goodSeqFile + toString(processIDS[i]) + ".temp"), goodSeqFile);
410 m->mothurRemove((goodSeqFile + toString(processIDS[i]) + ".temp"));
412 m->appendFiles((badAccnosFile + toString(processIDS[i]) + ".temp"), badAccnosFile);
413 m->mothurRemove((badAccnosFile + toString(processIDS[i]) + ".temp"));
416 if (m->control_pressed) { m->mothurRemove(goodSeqFile); return 0; }
418 //read badSeqs in because root process doesnt know what other "bad" seqs the children found
420 int ableToOpen = m->openInputFile(badAccnosFile, inBad, "no error");
422 if (ableToOpen == 0) {
425 while (!inBad.eof()) {
426 inBad >> tempName; m->gobble(inBad);
427 badSeqNames.insert(tempName);
433 numFastaSeqs = driver(lines[0], goodSeqFile, badAccnosFile, fastafile, badSeqNames);
435 if (m->control_pressed) { m->mothurRemove(goodSeqFile); return 0; }
442 MPI_Comm_rank(MPI_COMM_WORLD, &pid);
444 if (pid == 0) { //only one process should fix files
446 //read accnos file with all names in it, process 0 just has its names
447 MPI_File inMPIAccnos;
450 char inFileName[1024];
451 strcpy(inFileName, badAccnosFile.c_str());
453 MPI_File_open(MPI_COMM_SELF, inFileName, inMode, MPI_INFO_NULL, &inMPIAccnos); //comm, filename, mode, info, filepointer
454 MPI_File_get_size(inMPIAccnos, &size);
456 char* buffer = new char[size];
457 MPI_File_read(inMPIAccnos, buffer, size, MPI_CHAR, &status);
459 string tempBuf = buffer;
460 if (tempBuf.length() > size) { tempBuf = tempBuf.substr(0, size); }
461 istringstream iss (tempBuf,istringstream::in);
464 MPI_File_close(&inMPIAccnos);
469 iss >> tempName; m->gobble(iss);
470 badSeqNames.insert(tempName);
474 if(namefile != "" && groupfile != "") {
475 screenNameGroupFile(badSeqNames);
476 if (m->control_pressed) { m->mothurRemove(goodSeqFile); return 0; }
477 }else if(namefile != "") {
478 screenNameGroupFile(badSeqNames);
479 if (m->control_pressed) { m->mothurRemove(goodSeqFile); return 0; }
480 }else if(groupfile != "") { screenGroupFile(badSeqNames); } // this screens just the group
482 if (m->control_pressed) { m->mothurRemove(goodSeqFile); return 0; }
484 if(alignreport != "") { screenAlignReport(badSeqNames); }
485 if(qualfile != "") { screenQual(badSeqNames); }
487 if (m->control_pressed) { m->mothurRemove(goodSeqFile); return 0; }
493 m->mothurOutEndLine();
494 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
495 m->mothurOut(goodSeqFile); m->mothurOutEndLine(); outputTypes["fasta"].push_back(goodSeqFile);
496 m->mothurOut(badAccnosFile); m->mothurOutEndLine(); outputTypes["accnos"].push_back(badAccnosFile);
497 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
498 m->mothurOutEndLine();
499 m->mothurOutEndLine();
501 //set fasta file as new current fastafile
503 itTypes = outputTypes.find("fasta");
504 if (itTypes != outputTypes.end()) {
505 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFastaFile(current); }
508 itTypes = outputTypes.find("name");
509 if (itTypes != outputTypes.end()) {
510 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setNameFile(current); }
513 itTypes = outputTypes.find("group");
514 if (itTypes != outputTypes.end()) {
515 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setGroupFile(current); }
518 itTypes = outputTypes.find("qfile");
519 if (itTypes != outputTypes.end()) {
520 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setQualFile(current); }
523 m->mothurOut("It took " + toString(time(NULL) - start) + " secs to screen " + toString(numFastaSeqs) + " sequences.");
524 m->mothurOutEndLine();
528 catch(exception& e) {
529 m->errorOut(e, "ScreenSeqsCommand", "execute");
534 //***************************************************************************************************************
536 int ScreenSeqsCommand::screenNameGroupFile(set<string> badSeqNames){
539 m->openInputFile(namefile, inputNames);
540 set<string> badSeqGroups;
541 string seqName, seqList, group;
542 set<string>::iterator it;
544 string goodNameFile = outputDir + m->getRootName(m->getSimpleName(namefile)) + "good" + m->getExtension(namefile);
545 outputNames.push_back(goodNameFile); outputTypes["name"].push_back(goodNameFile);
547 ofstream goodNameOut; m->openOutputFile(goodNameFile, goodNameOut);
549 while(!inputNames.eof()){
550 if (m->control_pressed) { goodNameOut.close(); inputNames.close(); m->mothurRemove(goodNameFile); return 0; }
552 inputNames >> seqName >> seqList;
553 it = badSeqNames.find(seqName);
555 if(it != badSeqNames.end()){
556 badSeqNames.erase(it);
560 for(int i=0;i<seqList.length();i++){
561 if(seqList[i] == ','){
562 badSeqGroups.insert(seqList.substr(start,i-start));
566 badSeqGroups.insert(seqList.substr(start,seqList.length()-start));
570 goodNameOut << seqName << '\t' << seqList << endl;
572 m->gobble(inputNames);
577 //we were unable to remove some of the bad sequences
578 if (badSeqNames.size() != 0) {
579 for (it = badSeqNames.begin(); it != badSeqNames.end(); it++) {
580 m->mothurOut("Your namefile does not include the sequence " + *it + " please correct.");
581 m->mothurOutEndLine();
587 ifstream inputGroups;
588 m->openInputFile(groupfile, inputGroups);
590 string goodGroupFile = outputDir + m->getRootName(m->getSimpleName(groupfile)) + "good" + m->getExtension(groupfile);
591 outputNames.push_back(goodGroupFile); outputTypes["group"].push_back(goodGroupFile);
593 ofstream goodGroupOut; m->openOutputFile(goodGroupFile, goodGroupOut);
595 while(!inputGroups.eof()){
596 if (m->control_pressed) { goodGroupOut.close(); inputGroups.close(); m->mothurRemove(goodNameFile); m->mothurRemove(goodGroupFile); return 0; }
598 inputGroups >> seqName >> group;
600 it = badSeqGroups.find(seqName);
602 if(it != badSeqGroups.end()){
603 badSeqGroups.erase(it);
606 goodGroupOut << seqName << '\t' << group << endl;
608 m->gobble(inputGroups);
611 goodGroupOut.close();
613 //we were unable to remove some of the bad sequences
614 if (badSeqGroups.size() != 0) {
615 for (it = badSeqGroups.begin(); it != badSeqGroups.end(); it++) {
616 m->mothurOut("Your groupfile does not include the sequence " + *it + " please correct.");
617 m->mothurOutEndLine();
626 catch(exception& e) {
627 m->errorOut(e, "ScreenSeqsCommand", "screenNameGroupFile");
631 //***************************************************************************************************************
632 int ScreenSeqsCommand::getSummary(vector<unsigned long int>& positions){
635 vector<int> startPosition;
636 vector<int> endPosition;
637 vector<int> seqLength;
638 vector<int> ambigBases;
639 vector<int> longHomoPolymer;
641 vector<unsigned long int> positions = m->divideFile(fastafile, processors);
643 for (int i = 0; i < (positions.size()-1); i++) {
644 lines.push_back(new linePair(positions[i], positions[(i+1)]));
648 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
650 numSeqs = driverCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, fastafile, lines[0]);
652 numSeqs = createProcessesCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, fastafile);
655 if (m->control_pressed) { return 0; }
657 numSeqs = driverCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, fastafile, lines[0]);
658 if (m->control_pressed) { return 0; }
661 sort(startPosition.begin(), startPosition.end());
662 sort(endPosition.begin(), endPosition.end());
663 sort(seqLength.begin(), seqLength.end());
664 sort(ambigBases.begin(), ambigBases.end());
665 sort(longHomoPolymer.begin(), longHomoPolymer.end());
667 //numSeqs is the number of unique seqs, startPosition.size() is the total number of seqs, we want to optimize using all seqs
668 int criteriaPercentile = int(startPosition.size() * (criteria / (float) 100));
670 for (int i = 0; i < optimize.size(); i++) {
671 if (optimize[i] == "start") { startPos = startPosition[criteriaPercentile]; m->mothurOut("Optimizing start to " + toString(startPos) + "."); m->mothurOutEndLine(); }
672 else if (optimize[i] == "end") { int endcriteriaPercentile = int(endPosition.size() * ((100 - criteria) / (float) 100)); endPos = endPosition[endcriteriaPercentile]; m->mothurOut("Optimizing end to " + toString(endPos) + "."); m->mothurOutEndLine();}
673 else if (optimize[i] == "maxambig") { maxAmbig = ambigBases[criteriaPercentile]; m->mothurOut("Optimizing maxambig to " + toString(maxAmbig) + "."); m->mothurOutEndLine(); }
674 else if (optimize[i] == "maxhomop") { maxHomoP = longHomoPolymer[criteriaPercentile]; m->mothurOut("Optimizing maxhomop to " + toString(maxHomoP) + "."); m->mothurOutEndLine(); }
675 else if (optimize[i] == "minlength") { int mincriteriaPercentile = int(seqLength.size() * ((100 - criteria) / (float) 100)); minLength = seqLength[mincriteriaPercentile]; m->mothurOut("Optimizing minlength to " + toString(minLength) + "."); m->mothurOutEndLine(); }
676 else if (optimize[i] == "maxlength") { maxLength = seqLength[criteriaPercentile]; m->mothurOut("Optimizing maxlength to " + toString(maxLength) + "."); m->mothurOutEndLine(); }
681 catch(exception& e) {
682 m->errorOut(e, "ScreenSeqsCommand", "getSummary");
686 /**************************************************************************************/
687 int ScreenSeqsCommand::driverCreateSummary(vector<int>& startPosition, vector<int>& endPosition, vector<int>& seqLength, vector<int>& ambigBases, vector<int>& longHomoPolymer, string filename, linePair* filePos) {
691 m->openInputFile(filename, in);
693 in.seekg(filePos->start);
700 if (m->control_pressed) { in.close(); return 1; }
702 Sequence current(in); m->gobble(in);
704 if (current.getName() != "") {
706 if (namefile != "") {
707 //make sure this sequence is in the namefile, else error
708 map<string, int>::iterator it = nameMap.find(current.getName());
710 if (it == nameMap.end()) { m->mothurOut("[ERROR]: " + current.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); m->control_pressed = true; }
711 else { num = it->second; }
714 //for each sequence this sequence represents
715 for (int i = 0; i < num; i++) {
716 startPosition.push_back(current.getStartPos());
717 endPosition.push_back(current.getEndPos());
718 seqLength.push_back(current.getNumBases());
719 ambigBases.push_back(current.getAmbigBases());
720 longHomoPolymer.push_back(current.getLongHomoPolymer());
726 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
727 unsigned long int pos = in.tellg();
728 if ((pos == -1) || (pos >= filePos->end)) { break; }
730 if (in.eof()) { break; }
739 catch(exception& e) {
740 m->errorOut(e, "ScreenSeqsCommand", "driverCreateSummary");
744 /**************************************************************************************************/
745 int ScreenSeqsCommand::createProcessesCreateSummary(vector<int>& startPosition, vector<int>& endPosition, vector<int>& seqLength, vector<int>& ambigBases, vector<int>& longHomoPolymer, string filename) {
747 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
752 //loop through and create all the processes you want
753 while (process != processors) {
757 processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later
760 num = driverCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, fastafile, lines[process]);
762 //pass numSeqs to parent
764 string tempFile = fastafile + toString(getpid()) + ".num.temp";
765 m->openOutputFile(tempFile, out);
768 out << startPosition.size() << endl;
769 for (int k = 0; k < startPosition.size(); k++) { out << startPosition[k] << '\t'; } out << endl;
770 for (int k = 0; k < endPosition.size(); k++) { out << endPosition[k] << '\t'; } out << endl;
771 for (int k = 0; k < seqLength.size(); k++) { out << seqLength[k] << '\t'; } out << endl;
772 for (int k = 0; k < ambigBases.size(); k++) { out << ambigBases[k] << '\t'; } out << endl;
773 for (int k = 0; k < longHomoPolymer.size(); k++) { out << longHomoPolymer[k] << '\t'; } out << endl;
779 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine();
780 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
785 num = driverCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, fastafile, lines[0]);
787 //force parent to wait until all the processes are done
788 for (int i=0;i<processIDS.size();i++) {
789 int temp = processIDS[i];
793 //parent reads in and combine Filter info
794 for (int i = 0; i < processIDS.size(); i++) {
795 string tempFilename = fastafile + toString(processIDS[i]) + ".num.temp";
797 m->openInputFile(tempFilename, in);
800 in >> tempNum; m->gobble(in); num += tempNum;
801 in >> tempNum; m->gobble(in);
802 for (int k = 0; k < tempNum; k++) { in >> temp; startPosition.push_back(temp); } m->gobble(in);
803 for (int k = 0; k < tempNum; k++) { in >> temp; endPosition.push_back(temp); } m->gobble(in);
804 for (int k = 0; k < tempNum; k++) { in >> temp; seqLength.push_back(temp); } m->gobble(in);
805 for (int k = 0; k < tempNum; k++) { in >> temp; ambigBases.push_back(temp); } m->gobble(in);
806 for (int k = 0; k < tempNum; k++) { in >> temp; longHomoPolymer.push_back(temp); } m->gobble(in);
809 m->mothurRemove(tempFilename);
815 catch(exception& e) {
816 m->errorOut(e, "ScreenSeqsCommand", "createProcessesCreateSummary");
821 //***************************************************************************************************************
823 int ScreenSeqsCommand::screenGroupFile(set<string> badSeqNames){
825 ifstream inputGroups;
826 m->openInputFile(groupfile, inputGroups);
827 string seqName, group;
828 set<string>::iterator it;
830 string goodGroupFile = outputDir + m->getRootName(m->getSimpleName(groupfile)) + "good" + m->getExtension(groupfile);
831 outputNames.push_back(goodGroupFile); outputTypes["group"].push_back(goodGroupFile);
832 ofstream goodGroupOut; m->openOutputFile(goodGroupFile, goodGroupOut);
834 while(!inputGroups.eof()){
835 if (m->control_pressed) { goodGroupOut.close(); inputGroups.close(); m->mothurRemove(goodGroupFile); return 0; }
837 inputGroups >> seqName >> group;
838 it = badSeqNames.find(seqName);
840 if(it != badSeqNames.end()){
841 badSeqNames.erase(it);
844 goodGroupOut << seqName << '\t' << group << endl;
846 m->gobble(inputGroups);
849 if (m->control_pressed) { goodGroupOut.close(); inputGroups.close(); m->mothurRemove(goodGroupFile); return 0; }
851 //we were unable to remove some of the bad sequences
852 if (badSeqNames.size() != 0) {
853 for (it = badSeqNames.begin(); it != badSeqNames.end(); it++) {
854 m->mothurOut("Your groupfile does not include the sequence " + *it + " please correct.");
855 m->mothurOutEndLine();
860 goodGroupOut.close();
862 if (m->control_pressed) { m->mothurRemove(goodGroupFile); }
867 catch(exception& e) {
868 m->errorOut(e, "ScreenSeqsCommand", "screenGroupFile");
873 //***************************************************************************************************************
875 int ScreenSeqsCommand::screenAlignReport(set<string> badSeqNames){
877 ifstream inputAlignReport;
878 m->openInputFile(alignreport, inputAlignReport);
879 string seqName, group;
880 set<string>::iterator it;
882 string goodAlignReportFile = outputDir + m->getRootName(m->getSimpleName(alignreport)) + "good" + m->getExtension(alignreport);
883 outputNames.push_back(goodAlignReportFile); outputTypes["alignreport"].push_back(goodAlignReportFile);
884 ofstream goodAlignReportOut; m->openOutputFile(goodAlignReportFile, goodAlignReportOut);
886 while (!inputAlignReport.eof()) { // need to copy header
887 char c = inputAlignReport.get();
888 goodAlignReportOut << c;
889 if (c == 10 || c == 13){ break; }
892 while(!inputAlignReport.eof()){
893 if (m->control_pressed) { goodAlignReportOut.close(); inputAlignReport.close(); m->mothurRemove(goodAlignReportFile); return 0; }
895 inputAlignReport >> seqName;
896 it = badSeqNames.find(seqName);
898 while (!inputAlignReport.eof()) { // need to copy header
899 char c = inputAlignReport.get();
901 if (c == 10 || c == 13){ break; }
904 if(it != badSeqNames.end()){
905 badSeqNames.erase(it);
908 goodAlignReportOut << seqName << '\t' << line;
910 m->gobble(inputAlignReport);
913 if (m->control_pressed) { goodAlignReportOut.close(); inputAlignReport.close(); m->mothurRemove(goodAlignReportFile); return 0; }
915 //we were unable to remove some of the bad sequences
916 if (badSeqNames.size() != 0) {
917 for (it = badSeqNames.begin(); it != badSeqNames.end(); it++) {
918 m->mothurOut("Your alignreport file does not include the sequence " + *it + " please correct.");
919 m->mothurOutEndLine();
923 inputAlignReport.close();
924 goodAlignReportOut.close();
926 if (m->control_pressed) { m->mothurRemove(goodAlignReportFile); return 0; }
931 catch(exception& e) {
932 m->errorOut(e, "ScreenSeqsCommand", "screenAlignReport");
937 //***************************************************************************************************************
939 int ScreenSeqsCommand::screenQual(set<string> badSeqNames){
942 m->openInputFile(qualfile, in);
943 set<string>::iterator it;
945 string goodQualFile = outputDir + m->getRootName(m->getSimpleName(qualfile)) + "good" + m->getExtension(qualfile);
946 outputNames.push_back(goodQualFile); outputTypes["qfile"].push_back(goodQualFile);
947 ofstream goodQual; m->openOutputFile(goodQualFile, goodQual);
951 if (m->control_pressed) { goodQual.close(); in.close(); m->mothurRemove(goodQualFile); return 0; }
953 string saveName = "";
959 if (name.length() != 0) {
960 saveName = name.substr(1);
963 if (c == 10 || c == 13){ break; }
970 char letter= in.get();
971 if(letter == '>'){ in.putback(letter); break; }
972 else{ scores += letter; }
977 it = badSeqNames.find(saveName);
979 if(it != badSeqNames.end()){
980 badSeqNames.erase(it);
982 goodQual << name << endl << scores;
991 //we were unable to remove some of the bad sequences
992 if (badSeqNames.size() != 0) {
993 for (it = badSeqNames.begin(); it != badSeqNames.end(); it++) {
994 m->mothurOut("Your qual file does not include the sequence " + *it + " please correct.");
995 m->mothurOutEndLine();
999 if (m->control_pressed) { m->mothurRemove(goodQualFile); return 0; }
1004 catch(exception& e) {
1005 m->errorOut(e, "ScreenSeqsCommand", "screenQual");
1010 //**********************************************************************************************************************
1012 int ScreenSeqsCommand::driver(linePair* filePos, string goodFName, string badAccnosFName, string filename, set<string>& badSeqNames){
1015 m->openOutputFile(goodFName, goodFile);
1017 ofstream badAccnosFile;
1018 m->openOutputFile(badAccnosFName, badAccnosFile);
1021 m->openInputFile(filename, inFASTA);
1023 inFASTA.seekg(filePos->start);
1030 if (m->control_pressed) { return 0; }
1032 Sequence currSeq(inFASTA); m->gobble(inFASTA);
1033 if (currSeq.getName() != "") {
1034 bool goodSeq = 1; // innocent until proven guilty
1035 if(goodSeq == 1 && startPos != -1 && startPos < currSeq.getStartPos()) { goodSeq = 0; }
1036 if(goodSeq == 1 && endPos != -1 && endPos > currSeq.getEndPos()) { goodSeq = 0; }
1037 if(goodSeq == 1 && maxAmbig != -1 && maxAmbig < currSeq.getAmbigBases()) { goodSeq = 0; }
1038 if(goodSeq == 1 && maxHomoP != -1 && maxHomoP < currSeq.getLongHomoPolymer()) { goodSeq = 0; }
1039 if(goodSeq == 1 && minLength != -1 && minLength > currSeq.getNumBases()) { goodSeq = 0; }
1040 if(goodSeq == 1 && maxLength != -1 && maxLength < currSeq.getNumBases()) { goodSeq = 0; }
1043 currSeq.printSequence(goodFile);
1046 badAccnosFile << currSeq.getName() << endl;
1047 badSeqNames.insert(currSeq.getName());
1052 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
1053 unsigned long int pos = inFASTA.tellg();
1054 if ((pos == -1) || (pos >= filePos->end)) { break; }
1056 if (inFASTA.eof()) { break; }
1060 if((count) % 100 == 0){ m->mothurOut("Processing sequence: " + toString(count)); m->mothurOutEndLine(); }
1063 if((count) % 100 != 0){ m->mothurOut("Processing sequence: " + toString(count)); m->mothurOutEndLine(); }
1068 badAccnosFile.close();
1072 catch(exception& e) {
1073 m->errorOut(e, "ScreenSeqsCommand", "driver");
1077 //**********************************************************************************************************************
1079 int ScreenSeqsCommand::driverMPI(int start, int num, MPI_File& inMPI, MPI_File& goodFile, MPI_File& badAccnosFile, vector<unsigned long int>& MPIPos, set<string>& badSeqNames){
1081 string outputString = "";
1082 MPI_Status statusGood;
1083 MPI_Status statusBadAccnos;
1086 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
1088 for(int i=0;i<num;i++){
1090 if (m->control_pressed) { return 0; }
1092 //read next sequence
1093 int length = MPIPos[start+i+1] - MPIPos[start+i];
1095 char* buf4 = new char[length];
1096 memcpy(buf4, outputString.c_str(), length);
1098 MPI_File_read_at(inMPI, MPIPos[start+i], buf4, length, MPI_CHAR, &status);
1100 string tempBuf = buf4; delete buf4;
1101 if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length); }
1102 istringstream iss (tempBuf,istringstream::in);
1104 Sequence currSeq(iss);
1107 if (currSeq.getName() != "") {
1108 bool goodSeq = 1; // innocent until proven guilty
1109 if(goodSeq == 1 && startPos != -1 && startPos < currSeq.getStartPos()) { goodSeq = 0; }
1110 if(goodSeq == 1 && endPos != -1 && endPos > currSeq.getEndPos()) { goodSeq = 0; }
1111 if(goodSeq == 1 && maxAmbig != -1 && maxAmbig < currSeq.getAmbigBases()) { goodSeq = 0; }
1112 if(goodSeq == 1 && maxHomoP != -1 && maxHomoP < currSeq.getLongHomoPolymer()) { goodSeq = 0; }
1113 if(goodSeq == 1 && minLength != -1 && minLength > currSeq.getNumBases()) { goodSeq = 0; }
1114 if(goodSeq == 1 && maxLength != -1 && maxLength < currSeq.getNumBases()) { goodSeq = 0; }
1117 outputString = ">" + currSeq.getName() + "\n" + currSeq.getAligned() + "\n";
1120 length = outputString.length();
1121 char* buf2 = new char[length];
1122 memcpy(buf2, outputString.c_str(), length);
1124 MPI_File_write_shared(goodFile, buf2, length, MPI_CHAR, &statusGood);
1129 badSeqNames.insert(currSeq.getName());
1131 //write to bad accnos file
1132 outputString = currSeq.getName() + "\n";
1134 length = outputString.length();
1135 char* buf3 = new char[length];
1136 memcpy(buf3, outputString.c_str(), length);
1138 MPI_File_write_shared(badAccnosFile, buf3, length, MPI_CHAR, &statusBadAccnos);
1144 if((i) % 100 == 0){ m->mothurOut("Processing sequence: " + toString(i)); m->mothurOutEndLine(); }
1149 catch(exception& e) {
1150 m->errorOut(e, "ScreenSeqsCommand", "driverMPI");
1155 /**************************************************************************************************/
1157 int ScreenSeqsCommand::createProcesses(string goodFileName, string badAccnos, string filename, set<string>& badSeqNames) {
1159 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
1163 //loop through and create all the processes you want
1164 while (process != processors) {
1168 processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later
1170 }else if (pid == 0){
1171 num = driver(lines[process], goodFileName + toString(getpid()) + ".temp", badAccnos + toString(getpid()) + ".temp", filename, badSeqNames);
1173 //pass numSeqs to parent
1175 string tempFile = filename + toString(getpid()) + ".num.temp";
1176 m->openOutputFile(tempFile, out);
1182 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine();
1183 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
1188 //force parent to wait until all the processes are done
1189 for (int i=0;i<processors;i++) {
1190 int temp = processIDS[i];
1194 for (int i = 0; i < processIDS.size(); i++) {
1196 string tempFile = filename + toString(processIDS[i]) + ".num.temp";
1197 m->openInputFile(tempFile, in);
1198 if (!in.eof()) { int tempNum = 0; in >> tempNum; num += tempNum; }
1199 in.close(); m->mothurRemove(tempFile);
1205 catch(exception& e) {
1206 m->errorOut(e, "ScreenSeqsCommand", "createProcesses");
1211 //***************************************************************************************************************