2 * screenseqscommand.cpp
5 * Created by Pat Schloss on 6/3/09.
6 * Copyright 2009 Patrick D. Schloss. All rights reserved.
10 #include "screenseqscommand.h"
11 #include "sequence.hpp"
13 //**********************************************************************************************************************
14 vector<string> ScreenSeqsCommand::setParameters(){
16 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta);
17 CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname);
18 CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pgroup);
19 CommandParameter palignreport("alignreport", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(palignreport);
20 CommandParameter pstart("start", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pstart);
21 CommandParameter pend("end", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pend);
22 CommandParameter pmaxambig("maxambig", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pmaxambig);
23 CommandParameter pmaxhomop("maxhomop", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pmaxhomop);
24 CommandParameter pminlength("minlength", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pminlength);
25 CommandParameter pmaxlength("maxlength", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pmaxlength);
26 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
27 CommandParameter pcriteria("criteria", "Number", "", "90", "", "", "",false,false); parameters.push_back(pcriteria);
28 CommandParameter poptimize("optimize", "Multiple", "none-start-end-maxambig-maxhomop-minlength-maxlength", "none", "", "", "",true,false); parameters.push_back(poptimize);
29 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
30 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
32 vector<string> myArray;
33 for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); }
37 m->errorOut(e, "ScreenSeqsCommand", "setParameters");
41 //**********************************************************************************************************************
42 string ScreenSeqsCommand::getHelpString(){
44 string helpString = "";
45 helpString += "The screen.seqs command reads a fastafile and creates .....\n";
46 helpString += "The screen.seqs command parameters are fasta, start, end, maxambig, maxhomop, minlength, maxlength, name, group, optimize, criteria and processors.\n";
47 helpString += "The fasta parameter is required.\n";
48 helpString += "The start parameter .... The default is -1.\n";
49 helpString += "The end parameter .... The default is -1.\n";
50 helpString += "The maxambig parameter allows you to set the maximum number of ambigious bases allowed. The default is -1.\n";
51 helpString += "The maxhomop parameter allows you to set a maximum homopolymer length. \n";
52 helpString += "The minlength parameter allows you to set and minimum sequence length. \n";
53 helpString += "The maxlength parameter allows you to set and maximum sequence length. \n";
54 helpString += "The processors parameter allows you to specify the number of processors to use while running the command. The default is 1.\n";
55 helpString += "The optimize and criteria parameters allow you set the start, end, maxabig, maxhomop, minlength and maxlength parameters relative to your set of sequences .\n";
56 helpString += "For example optimize=start-end, criteria=90, would set the start and end values to the position 90% of your sequences started and ended.\n";
57 helpString += "The name parameter allows you to provide a namesfile, and the group parameter allows you to provide a groupfile.\n";
58 helpString += "The screen.seqs command should be in the following format: \n";
59 helpString += "screen.seqs(fasta=yourFastaFile, name=youNameFile, group=yourGroupFIle, start=yourStart, end=yourEnd, maxambig=yourMaxambig, \n";
60 helpString += "maxhomop=yourMaxhomop, minlength=youMinlength, maxlength=yourMaxlength) \n";
61 helpString += "Example screen.seqs(fasta=abrecovery.fasta, name=abrecovery.names, group=abrecovery.groups, start=..., end=..., maxambig=..., maxhomop=..., minlength=..., maxlength=...).\n";
62 helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n";
66 m->errorOut(e, "ScreenSeqsCommand", "getHelpString");
70 //**********************************************************************************************************************
71 ScreenSeqsCommand::ScreenSeqsCommand(){
73 abort = true; calledHelp = true;
75 vector<string> tempOutNames;
76 outputTypes["fasta"] = tempOutNames;
77 outputTypes["name"] = tempOutNames;
78 outputTypes["group"] = tempOutNames;
79 outputTypes["alignreport"] = tempOutNames;
80 outputTypes["accnos"] = tempOutNames;
83 m->errorOut(e, "ScreenSeqsCommand", "ScreenSeqsCommand");
87 //***************************************************************************************************************
89 ScreenSeqsCommand::ScreenSeqsCommand(string option) {
91 abort = false; calledHelp = false;
93 //allow user to run help
94 if(option == "help") { help(); abort = true; calledHelp = true; }
97 vector<string> myArray = setParameters();
99 OptionParser parser(option);
100 map<string,string> parameters = parser.getParameters();
102 ValidParameters validParameter("screen.seqs");
103 map<string,string>::iterator it;
105 //check to make sure all parameters are valid for command
106 for (it = parameters.begin(); it != parameters.end(); it++) {
107 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
110 //initialize outputTypes
111 vector<string> tempOutNames;
112 outputTypes["fasta"] = tempOutNames;
113 outputTypes["name"] = tempOutNames;
114 outputTypes["group"] = tempOutNames;
115 outputTypes["alignreport"] = tempOutNames;
116 outputTypes["accnos"] = tempOutNames;
118 //if the user changes the input directory command factory will send this info to us in the output parameter
119 string inputDir = validParameter.validFile(parameters, "inputdir", false);
120 if (inputDir == "not found"){ inputDir = ""; }
123 it = parameters.find("fasta");
124 //user has given a template file
125 if(it != parameters.end()){
126 path = m->hasPath(it->second);
127 //if the user has not given a path then, add inputdir. else leave path alone.
128 if (path == "") { parameters["fasta"] = inputDir + it->second; }
131 it = parameters.find("group");
132 //user has given a template file
133 if(it != parameters.end()){
134 path = m->hasPath(it->second);
135 //if the user has not given a path then, add inputdir. else leave path alone.
136 if (path == "") { parameters["group"] = inputDir + it->second; }
139 it = parameters.find("name");
140 //user has given a template file
141 if(it != parameters.end()){
142 path = m->hasPath(it->second);
143 //if the user has not given a path then, add inputdir. else leave path alone.
144 if (path == "") { parameters["name"] = inputDir + it->second; }
147 it = parameters.find("alignreport");
148 //user has given a template file
149 if(it != parameters.end()){
150 path = m->hasPath(it->second);
151 //if the user has not given a path then, add inputdir. else leave path alone.
152 if (path == "") { parameters["alignreport"] = inputDir + it->second; }
156 //check for required parameters
157 fastafile = validParameter.validFile(parameters, "fasta", true);
158 if (fastafile == "not found") {
159 fastafile = m->getFastaFile();
160 if (fastafile != "") { m->mothurOut("Using " + fastafile + " as input file for the fasta parameter."); m->mothurOutEndLine(); }
161 else { m->mothurOut("You have no current fastafile and the fasta parameter is required."); m->mothurOutEndLine(); abort = true; }
163 else if (fastafile == "not open") { abort = true; }
165 groupfile = validParameter.validFile(parameters, "group", true);
166 if (groupfile == "not open") { abort = true; }
167 else if (groupfile == "not found") { groupfile = ""; }
169 namefile = validParameter.validFile(parameters, "name", true);
170 if (namefile == "not open") { namefile = ""; abort = true; }
171 else if (namefile == "not found") { namefile = ""; }
173 alignreport = validParameter.validFile(parameters, "alignreport", true);
174 if (alignreport == "not open") { abort = true; }
175 else if (alignreport == "not found") { alignreport = ""; }
177 //if the user changes the output directory command factory will send this info to us in the output parameter
178 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){
180 outputDir += m->hasPath(fastafile); //if user entered a file with a path then preserve it
183 //check for optional parameter and set defaults
184 // ...at some point should added some additional type checking...
186 temp = validParameter.validFile(parameters, "start", false); if (temp == "not found") { temp = "-1"; }
187 convert(temp, startPos);
189 temp = validParameter.validFile(parameters, "end", false); if (temp == "not found") { temp = "-1"; }
190 convert(temp, endPos);
192 temp = validParameter.validFile(parameters, "maxambig", false); if (temp == "not found") { temp = "-1"; }
193 convert(temp, maxAmbig);
195 temp = validParameter.validFile(parameters, "maxhomop", false); if (temp == "not found") { temp = "-1"; }
196 convert(temp, maxHomoP);
198 temp = validParameter.validFile(parameters, "minlength", false); if (temp == "not found") { temp = "-1"; }
199 convert(temp, minLength);
201 temp = validParameter.validFile(parameters, "maxlength", false); if (temp == "not found") { temp = "-1"; }
202 convert(temp, maxLength);
204 temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); }
205 m->setProcessors(temp);
206 convert(temp, processors);
208 temp = validParameter.validFile(parameters, "optimize", false); //optimizing trumps the optimized values original value
209 if (temp == "not found"){ temp = "none"; }
210 m->splitAtDash(temp, optimize);
212 //check for invalid optimize options
213 set<string> validOptimizers;
214 validOptimizers.insert("none"); validOptimizers.insert("start"); validOptimizers.insert("end"); validOptimizers.insert("maxambig"); validOptimizers.insert("maxhomop"); validOptimizers.insert("minlength"); validOptimizers.insert("maxlength");
215 for (int i = 0; i < optimize.size(); i++) {
216 if (validOptimizers.count(optimize[i]) == 0) {
217 m->mothurOut(optimize[i] + " is not a valid optimizer. Valid options are start, end, maxambig, maxhomop, minlength and maxlength."); m->mothurOutEndLine();
218 optimize.erase(optimize.begin()+i);
223 if (optimize.size() == 1) { if (optimize[0] == "none") { optimize.clear(); } }
225 temp = validParameter.validFile(parameters, "criteria", false); if (temp == "not found"){ temp = "90"; }
226 convert(temp, criteria);
230 catch(exception& e) {
231 m->errorOut(e, "ScreenSeqsCommand", "ScreenSeqsCommand");
235 //***************************************************************************************************************
237 int ScreenSeqsCommand::execute(){
240 if (abort == true) { if (calledHelp) { return 0; } return 2; }
242 //if the user want to optimize we need to know the 90% mark
243 vector<unsigned long int> positions;
244 if (optimize.size() != 0) { //get summary is paralellized so we need to divideFile, no need to do this step twice so I moved it here
245 //use the namefile to optimize correctly
246 if (namefile != "") { nameMap = m->readNames(namefile); }
247 getSummary(positions);
250 positions = m->divideFile(fastafile, processors);
251 for (int i = 0; i < (positions.size()-1); i++) {
252 lines.push_back(new linePair(positions[i], positions[(i+1)]));
256 string goodSeqFile = outputDir + m->getRootName(m->getSimpleName(fastafile)) + "good" + m->getExtension(fastafile);
257 string badAccnosFile = outputDir + m->getRootName(m->getSimpleName(fastafile)) + "bad.accnos";
259 int numFastaSeqs = 0;
260 set<string> badSeqNames;
261 int start = time(NULL);
264 int pid, numSeqsPerProcessor;
266 vector<unsigned long int> MPIPos;
269 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
270 MPI_Comm_size(MPI_COMM_WORLD, &processors);
274 MPI_File outMPIBadAccnos;
276 int outMode=MPI_MODE_CREATE|MPI_MODE_WRONLY;
277 int inMode=MPI_MODE_RDONLY;
279 char outGoodFilename[1024];
280 strcpy(outGoodFilename, goodSeqFile.c_str());
282 char outBadAccnosFilename[1024];
283 strcpy(outBadAccnosFilename, badAccnosFile.c_str());
285 char inFileName[1024];
286 strcpy(inFileName, fastafile.c_str());
288 MPI_File_open(MPI_COMM_WORLD, inFileName, inMode, MPI_INFO_NULL, &inMPI); //comm, filename, mode, info, filepointer
289 MPI_File_open(MPI_COMM_WORLD, outGoodFilename, outMode, MPI_INFO_NULL, &outMPIGood);
290 MPI_File_open(MPI_COMM_WORLD, outBadAccnosFilename, outMode, MPI_INFO_NULL, &outMPIBadAccnos);
292 if (m->control_pressed) { MPI_File_close(&inMPI); MPI_File_close(&outMPIGood); MPI_File_close(&outMPIBadAccnos); return 0; }
294 if (pid == 0) { //you are the root process
296 MPIPos = m->setFilePosFasta(fastafile, numFastaSeqs); //fills MPIPos, returns numSeqs
298 //send file positions to all processes
299 for(int i = 1; i < processors; i++) {
300 MPI_Send(&numFastaSeqs, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
301 MPI_Send(&MPIPos[0], (numFastaSeqs+1), MPI_LONG, i, tag, MPI_COMM_WORLD);
304 //figure out how many sequences you have to align
305 numSeqsPerProcessor = numFastaSeqs / processors;
306 int startIndex = pid * numSeqsPerProcessor;
307 if(pid == (processors - 1)){ numSeqsPerProcessor = numFastaSeqs - pid * numSeqsPerProcessor; }
308 // cout << pid << '\t' << numSeqsPerProcessor << '\t' << startIndex << endl;
310 driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPIGood, outMPIBadAccnos, MPIPos, badSeqNames);
311 //cout << pid << " done" << endl;
312 if (m->control_pressed) { MPI_File_close(&inMPI); MPI_File_close(&outMPIGood); MPI_File_close(&outMPIBadAccnos); return 0; }
314 for (int i = 1; i < processors; i++) {
318 MPI_Recv(&badSize, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
319 /*for (int j = 0; j < badSize; j++) {
321 MPI_Recv(&length, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status); //recv the length of the name
322 char* buf2 = new char[length]; //make space to recieve it
323 MPI_Recv(buf2, length, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status); //get name
325 string tempBuf = buf2;
326 if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length); }
329 badSeqNames.insert(tempBuf);
332 }else{ //you are a child process
333 MPI_Recv(&numFastaSeqs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
334 MPIPos.resize(numFastaSeqs+1);
335 MPI_Recv(&MPIPos[0], (numFastaSeqs+1), MPI_LONG, 0, tag, MPI_COMM_WORLD, &status);
337 //figure out how many sequences you have to align
338 numSeqsPerProcessor = numFastaSeqs / processors;
339 int startIndex = pid * numSeqsPerProcessor;
340 if(pid == (processors - 1)){ numSeqsPerProcessor = numFastaSeqs - pid * numSeqsPerProcessor; }
341 //cout << pid << '\t' << numSeqsPerProcessor << '\t' << startIndex << endl;
343 driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPIGood, outMPIBadAccnos, MPIPos, badSeqNames);
344 //cout << pid << " done" << endl;
345 if (m->control_pressed) { MPI_File_close(&inMPI); MPI_File_close(&outMPIGood); MPI_File_close(&outMPIBadAccnos); return 0; }
348 int badSize = badSeqNames.size();
349 MPI_Send(&badSize, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
352 set<string>::iterator it;
353 for (it = badSeqNames.begin(); it != badSeqNames.end(); it++) {
355 int length = name.length();
356 char* buf2 = new char[length];
357 memcpy(buf2, name.c_str(), length);
359 MPI_Send(&length, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
360 MPI_Send(buf2, length, MPI_CHAR, 0, tag, MPI_COMM_WORLD);
365 MPI_File_close(&inMPI);
366 MPI_File_close(&outMPIGood);
367 MPI_File_close(&outMPIBadAccnos);
368 MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case
372 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
374 numFastaSeqs = driver(lines[0], goodSeqFile, badAccnosFile, fastafile, badSeqNames);
376 if (m->control_pressed) { remove(goodSeqFile.c_str()); return 0; }
379 processIDS.resize(0);
381 numFastaSeqs = createProcesses(goodSeqFile, badAccnosFile, fastafile, badSeqNames);
383 rename((goodSeqFile + toString(processIDS[0]) + ".temp").c_str(), goodSeqFile.c_str());
384 rename((badAccnosFile + toString(processIDS[0]) + ".temp").c_str(), badAccnosFile.c_str());
386 //append alignment and report files
387 for(int i=1;i<processors;i++){
388 m->appendFiles((goodSeqFile + toString(processIDS[i]) + ".temp"), goodSeqFile);
389 remove((goodSeqFile + toString(processIDS[i]) + ".temp").c_str());
391 m->appendFiles((badAccnosFile + toString(processIDS[i]) + ".temp"), badAccnosFile);
392 remove((badAccnosFile + toString(processIDS[i]) + ".temp").c_str());
395 if (m->control_pressed) { remove(goodSeqFile.c_str()); return 0; }
397 //read badSeqs in because root process doesnt know what other "bad" seqs the children found
399 int ableToOpen = m->openInputFile(badAccnosFile, inBad, "no error");
401 if (ableToOpen == 0) {
404 while (!inBad.eof()) {
405 inBad >> tempName; m->gobble(inBad);
406 badSeqNames.insert(tempName);
412 numFastaSeqs = driver(lines[0], goodSeqFile, badAccnosFile, fastafile, badSeqNames);
414 if (m->control_pressed) { remove(goodSeqFile.c_str()); return 0; }
421 MPI_Comm_rank(MPI_COMM_WORLD, &pid);
423 if (pid == 0) { //only one process should fix files
425 //read accnos file with all names in it, process 0 just has its names
426 MPI_File inMPIAccnos;
429 char inFileName[1024];
430 strcpy(inFileName, badAccnosFile.c_str());
432 MPI_File_open(MPI_COMM_SELF, inFileName, inMode, MPI_INFO_NULL, &inMPIAccnos); //comm, filename, mode, info, filepointer
433 MPI_File_get_size(inMPIAccnos, &size);
435 char* buffer = new char[size];
436 MPI_File_read(inMPIAccnos, buffer, size, MPI_CHAR, &status);
438 string tempBuf = buffer;
439 if (tempBuf.length() > size) { tempBuf = tempBuf.substr(0, size); }
440 istringstream iss (tempBuf,istringstream::in);
443 MPI_File_close(&inMPIAccnos);
448 iss >> tempName; m->gobble(iss);
449 badSeqNames.insert(tempName);
453 if(namefile != "" && groupfile != "") {
454 screenNameGroupFile(badSeqNames);
455 if (m->control_pressed) { remove(goodSeqFile.c_str()); return 0; }
456 }else if(namefile != "") {
457 screenNameGroupFile(badSeqNames);
458 if (m->control_pressed) { remove(goodSeqFile.c_str()); return 0; }
459 }else if(groupfile != "") { screenGroupFile(badSeqNames); } // this screens just the group
461 if (m->control_pressed) { remove(goodSeqFile.c_str()); return 0; }
463 if(alignreport != "") { screenAlignReport(badSeqNames); }
465 if (m->control_pressed) { remove(goodSeqFile.c_str()); return 0; }
471 m->mothurOutEndLine();
472 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
473 m->mothurOut(goodSeqFile); m->mothurOutEndLine(); outputTypes["fasta"].push_back(goodSeqFile);
474 m->mothurOut(badAccnosFile); m->mothurOutEndLine(); outputTypes["accnos"].push_back(badAccnosFile);
475 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
476 m->mothurOutEndLine();
477 m->mothurOutEndLine();
479 //set fasta file as new current fastafile
481 itTypes = outputTypes.find("fasta");
482 if (itTypes != outputTypes.end()) {
483 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFastaFile(current); }
486 itTypes = outputTypes.find("name");
487 if (itTypes != outputTypes.end()) {
488 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setNameFile(current); }
491 itTypes = outputTypes.find("group");
492 if (itTypes != outputTypes.end()) {
493 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setGroupFile(current); }
496 m->mothurOut("It took " + toString(time(NULL) - start) + " secs to screen " + toString(numFastaSeqs) + " sequences.");
497 m->mothurOutEndLine();
501 catch(exception& e) {
502 m->errorOut(e, "ScreenSeqsCommand", "execute");
507 //***************************************************************************************************************
509 int ScreenSeqsCommand::screenNameGroupFile(set<string> badSeqNames){
512 m->openInputFile(namefile, inputNames);
513 set<string> badSeqGroups;
514 string seqName, seqList, group;
515 set<string>::iterator it;
517 string goodNameFile = outputDir + m->getRootName(m->getSimpleName(namefile)) + "good" + m->getExtension(namefile);
518 outputNames.push_back(goodNameFile); outputTypes["name"].push_back(goodNameFile);
520 ofstream goodNameOut; m->openOutputFile(goodNameFile, goodNameOut);
522 while(!inputNames.eof()){
523 if (m->control_pressed) { goodNameOut.close(); inputNames.close(); remove(goodNameFile.c_str()); return 0; }
525 inputNames >> seqName >> seqList;
526 it = badSeqNames.find(seqName);
528 if(it != badSeqNames.end()){
529 badSeqNames.erase(it);
533 for(int i=0;i<seqList.length();i++){
534 if(seqList[i] == ','){
535 badSeqGroups.insert(seqList.substr(start,i-start));
539 badSeqGroups.insert(seqList.substr(start,seqList.length()-start));
543 goodNameOut << seqName << '\t' << seqList << endl;
545 m->gobble(inputNames);
550 //we were unable to remove some of the bad sequences
551 if (badSeqNames.size() != 0) {
552 for (it = badSeqNames.begin(); it != badSeqNames.end(); it++) {
553 m->mothurOut("Your namefile does not include the sequence " + *it + " please correct.");
554 m->mothurOutEndLine();
560 ifstream inputGroups;
561 m->openInputFile(groupfile, inputGroups);
563 string goodGroupFile = outputDir + m->getRootName(m->getSimpleName(groupfile)) + "good" + m->getExtension(groupfile);
564 outputNames.push_back(goodGroupFile); outputTypes["group"].push_back(goodGroupFile);
566 ofstream goodGroupOut; m->openOutputFile(goodGroupFile, goodGroupOut);
568 while(!inputGroups.eof()){
569 if (m->control_pressed) { goodGroupOut.close(); inputGroups.close(); remove(goodNameFile.c_str()); remove(goodGroupFile.c_str()); return 0; }
571 inputGroups >> seqName >> group;
573 it = badSeqGroups.find(seqName);
575 if(it != badSeqGroups.end()){
576 badSeqGroups.erase(it);
579 goodGroupOut << seqName << '\t' << group << endl;
581 m->gobble(inputGroups);
584 goodGroupOut.close();
586 //we were unable to remove some of the bad sequences
587 if (badSeqGroups.size() != 0) {
588 for (it = badSeqGroups.begin(); it != badSeqGroups.end(); it++) {
589 m->mothurOut("Your groupfile does not include the sequence " + *it + " please correct.");
590 m->mothurOutEndLine();
599 catch(exception& e) {
600 m->errorOut(e, "ScreenSeqsCommand", "screenNameGroupFile");
604 //***************************************************************************************************************
605 int ScreenSeqsCommand::getSummary(vector<unsigned long int>& positions){
608 vector<int> startPosition;
609 vector<int> endPosition;
610 vector<int> seqLength;
611 vector<int> ambigBases;
612 vector<int> longHomoPolymer;
614 vector<unsigned long int> positions = m->divideFile(fastafile, processors);
616 for (int i = 0; i < (positions.size()-1); i++) {
617 lines.push_back(new linePair(positions[i], positions[(i+1)]));
621 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
623 numSeqs = driverCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, fastafile, lines[0]);
625 numSeqs = createProcessesCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, fastafile);
628 if (m->control_pressed) { return 0; }
630 numSeqs = driverCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, fastafile, lines[0]);
631 if (m->control_pressed) { return 0; }
634 sort(startPosition.begin(), startPosition.end());
635 sort(endPosition.begin(), endPosition.end());
636 sort(seqLength.begin(), seqLength.end());
637 sort(ambigBases.begin(), ambigBases.end());
638 sort(longHomoPolymer.begin(), longHomoPolymer.end());
640 //numSeqs is the number of unique seqs, startPosition.size() is the total number of seqs, we want to optimize using all seqs
641 int criteriaPercentile = int(startPosition.size() * (criteria / (float) 100));
643 for (int i = 0; i < optimize.size(); i++) {
644 if (optimize[i] == "start") { startPos = startPosition[criteriaPercentile]; m->mothurOut("Optimizing start to " + toString(startPos) + "."); m->mothurOutEndLine(); }
645 else if (optimize[i] == "end") { int endcriteriaPercentile = int(endPosition.size() * ((100 - criteria) / (float) 100)); endPos = endPosition[endcriteriaPercentile]; m->mothurOut("Optimizing end to " + toString(endPos) + "."); m->mothurOutEndLine();}
646 else if (optimize[i] == "maxambig") { maxAmbig = ambigBases[criteriaPercentile]; m->mothurOut("Optimizing maxambig to " + toString(maxAmbig) + "."); m->mothurOutEndLine(); }
647 else if (optimize[i] == "maxhomop") { maxHomoP = longHomoPolymer[criteriaPercentile]; m->mothurOut("Optimizing maxhomop to " + toString(maxHomoP) + "."); m->mothurOutEndLine(); }
648 else if (optimize[i] == "minlength") { int mincriteriaPercentile = int(seqLength.size() * ((100 - criteria) / (float) 100)); minLength = seqLength[mincriteriaPercentile]; m->mothurOut("Optimizing minlength to " + toString(minLength) + "."); m->mothurOutEndLine(); }
649 else if (optimize[i] == "maxlength") { maxLength = seqLength[criteriaPercentile]; m->mothurOut("Optimizing maxlength to " + toString(maxLength) + "."); m->mothurOutEndLine(); }
654 catch(exception& e) {
655 m->errorOut(e, "ScreenSeqsCommand", "getSummary");
659 /**************************************************************************************/
660 int ScreenSeqsCommand::driverCreateSummary(vector<int>& startPosition, vector<int>& endPosition, vector<int>& seqLength, vector<int>& ambigBases, vector<int>& longHomoPolymer, string filename, linePair* filePos) {
664 m->openInputFile(filename, in);
666 in.seekg(filePos->start);
673 if (m->control_pressed) { in.close(); return 1; }
675 Sequence current(in); m->gobble(in);
677 if (current.getName() != "") {
679 if (namefile != "") {
680 //make sure this sequence is in the namefile, else error
681 map<string, int>::iterator it = nameMap.find(current.getName());
683 if (it == nameMap.end()) { m->mothurOut("[ERROR]: " + current.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); m->control_pressed = true; }
684 else { num = it->second; }
687 //for each sequence this sequence represents
688 for (int i = 0; i < num; i++) {
689 startPosition.push_back(current.getStartPos());
690 endPosition.push_back(current.getEndPos());
691 seqLength.push_back(current.getNumBases());
692 ambigBases.push_back(current.getAmbigBases());
693 longHomoPolymer.push_back(current.getLongHomoPolymer());
699 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
700 unsigned long int pos = in.tellg();
701 if ((pos == -1) || (pos >= filePos->end)) { break; }
703 if (in.eof()) { break; }
712 catch(exception& e) {
713 m->errorOut(e, "ScreenSeqsCommand", "driverCreateSummary");
717 /**************************************************************************************************/
718 int ScreenSeqsCommand::createProcessesCreateSummary(vector<int>& startPosition, vector<int>& endPosition, vector<int>& seqLength, vector<int>& ambigBases, vector<int>& longHomoPolymer, string filename) {
720 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
725 //loop through and create all the processes you want
726 while (process != processors) {
730 processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later
733 num = driverCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, fastafile, lines[process]);
735 //pass numSeqs to parent
737 string tempFile = fastafile + toString(getpid()) + ".num.temp";
738 m->openOutputFile(tempFile, out);
741 out << startPosition.size() << endl;
742 for (int k = 0; k < startPosition.size(); k++) { out << startPosition[k] << '\t'; } out << endl;
743 for (int k = 0; k < endPosition.size(); k++) { out << endPosition[k] << '\t'; } out << endl;
744 for (int k = 0; k < seqLength.size(); k++) { out << seqLength[k] << '\t'; } out << endl;
745 for (int k = 0; k < ambigBases.size(); k++) { out << ambigBases[k] << '\t'; } out << endl;
746 for (int k = 0; k < longHomoPolymer.size(); k++) { out << longHomoPolymer[k] << '\t'; } out << endl;
752 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine();
753 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
758 num = driverCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, fastafile, lines[0]);
760 //force parent to wait until all the processes are done
761 for (int i=0;i<processIDS.size();i++) {
762 int temp = processIDS[i];
766 //parent reads in and combine Filter info
767 for (int i = 0; i < processIDS.size(); i++) {
768 string tempFilename = fastafile + toString(processIDS[i]) + ".num.temp";
770 m->openInputFile(tempFilename, in);
773 in >> tempNum; m->gobble(in); num += tempNum;
774 in >> tempNum; m->gobble(in);
775 for (int k = 0; k < tempNum; k++) { in >> temp; startPosition.push_back(temp); } m->gobble(in);
776 for (int k = 0; k < tempNum; k++) { in >> temp; endPosition.push_back(temp); } m->gobble(in);
777 for (int k = 0; k < tempNum; k++) { in >> temp; seqLength.push_back(temp); } m->gobble(in);
778 for (int k = 0; k < tempNum; k++) { in >> temp; ambigBases.push_back(temp); } m->gobble(in);
779 for (int k = 0; k < tempNum; k++) { in >> temp; longHomoPolymer.push_back(temp); } m->gobble(in);
782 remove(tempFilename.c_str());
788 catch(exception& e) {
789 m->errorOut(e, "ScreenSeqsCommand", "createProcessesCreateSummary");
794 //***************************************************************************************************************
796 int ScreenSeqsCommand::screenGroupFile(set<string> badSeqNames){
798 ifstream inputGroups;
799 m->openInputFile(groupfile, inputGroups);
800 string seqName, group;
801 set<string>::iterator it;
803 string goodGroupFile = outputDir + m->getRootName(m->getSimpleName(groupfile)) + "good" + m->getExtension(groupfile);
804 outputNames.push_back(goodGroupFile); outputTypes["group"].push_back(goodGroupFile);
805 ofstream goodGroupOut; m->openOutputFile(goodGroupFile, goodGroupOut);
807 while(!inputGroups.eof()){
808 if (m->control_pressed) { goodGroupOut.close(); inputGroups.close(); remove(goodGroupFile.c_str()); return 0; }
810 inputGroups >> seqName >> group;
811 it = badSeqNames.find(seqName);
813 if(it != badSeqNames.end()){
814 badSeqNames.erase(it);
817 goodGroupOut << seqName << '\t' << group << endl;
819 m->gobble(inputGroups);
822 if (m->control_pressed) { goodGroupOut.close(); inputGroups.close(); remove(goodGroupFile.c_str()); return 0; }
824 //we were unable to remove some of the bad sequences
825 if (badSeqNames.size() != 0) {
826 for (it = badSeqNames.begin(); it != badSeqNames.end(); it++) {
827 m->mothurOut("Your groupfile does not include the sequence " + *it + " please correct.");
828 m->mothurOutEndLine();
833 goodGroupOut.close();
835 if (m->control_pressed) { remove(goodGroupFile.c_str()); }
840 catch(exception& e) {
841 m->errorOut(e, "ScreenSeqsCommand", "screenGroupFile");
846 //***************************************************************************************************************
848 int ScreenSeqsCommand::screenAlignReport(set<string> badSeqNames){
850 ifstream inputAlignReport;
851 m->openInputFile(alignreport, inputAlignReport);
852 string seqName, group;
853 set<string>::iterator it;
855 string goodAlignReportFile = outputDir + m->getRootName(m->getSimpleName(alignreport)) + "good" + m->getExtension(alignreport);
856 outputNames.push_back(goodAlignReportFile); outputTypes["alignreport"].push_back(goodAlignReportFile);
857 ofstream goodAlignReportOut; m->openOutputFile(goodAlignReportFile, goodAlignReportOut);
859 while (!inputAlignReport.eof()) { // need to copy header
860 char c = inputAlignReport.get();
861 goodAlignReportOut << c;
862 if (c == 10 || c == 13){ break; }
865 while(!inputAlignReport.eof()){
866 if (m->control_pressed) { goodAlignReportOut.close(); inputAlignReport.close(); remove(goodAlignReportFile.c_str()); return 0; }
868 inputAlignReport >> seqName;
869 it = badSeqNames.find(seqName);
871 while (!inputAlignReport.eof()) { // need to copy header
872 char c = inputAlignReport.get();
874 if (c == 10 || c == 13){ break; }
877 if(it != badSeqNames.end()){
878 badSeqNames.erase(it);
881 goodAlignReportOut << seqName << '\t' << line;
883 m->gobble(inputAlignReport);
886 if (m->control_pressed) { goodAlignReportOut.close(); inputAlignReport.close(); remove(goodAlignReportFile.c_str()); return 0; }
888 //we were unable to remove some of the bad sequences
889 if (badSeqNames.size() != 0) {
890 for (it = badSeqNames.begin(); it != badSeqNames.end(); it++) {
891 m->mothurOut("Your alignreport file does not include the sequence " + *it + " please correct.");
892 m->mothurOutEndLine();
896 inputAlignReport.close();
897 goodAlignReportOut.close();
899 if (m->control_pressed) { remove(goodAlignReportFile.c_str()); return 0; }
904 catch(exception& e) {
905 m->errorOut(e, "ScreenSeqsCommand", "screenAlignReport");
910 //**********************************************************************************************************************
912 int ScreenSeqsCommand::driver(linePair* filePos, string goodFName, string badAccnosFName, string filename, set<string>& badSeqNames){
915 m->openOutputFile(goodFName, goodFile);
917 ofstream badAccnosFile;
918 m->openOutputFile(badAccnosFName, badAccnosFile);
921 m->openInputFile(filename, inFASTA);
923 inFASTA.seekg(filePos->start);
930 if (m->control_pressed) { return 0; }
932 Sequence currSeq(inFASTA); m->gobble(inFASTA);
933 if (currSeq.getName() != "") {
934 bool goodSeq = 1; // innocent until proven guilty
935 if(goodSeq == 1 && startPos != -1 && startPos < currSeq.getStartPos()) { goodSeq = 0; }
936 if(goodSeq == 1 && endPos != -1 && endPos > currSeq.getEndPos()) { goodSeq = 0; }
937 if(goodSeq == 1 && maxAmbig != -1 && maxAmbig < currSeq.getAmbigBases()) { goodSeq = 0; }
938 if(goodSeq == 1 && maxHomoP != -1 && maxHomoP < currSeq.getLongHomoPolymer()) { goodSeq = 0; }
939 if(goodSeq == 1 && minLength != -1 && minLength > currSeq.getNumBases()) { goodSeq = 0; }
940 if(goodSeq == 1 && maxLength != -1 && maxLength < currSeq.getNumBases()) { goodSeq = 0; }
943 currSeq.printSequence(goodFile);
946 badAccnosFile << currSeq.getName() << endl;
947 badSeqNames.insert(currSeq.getName());
952 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
953 unsigned long int pos = inFASTA.tellg();
954 if ((pos == -1) || (pos >= filePos->end)) { break; }
956 if (inFASTA.eof()) { break; }
960 if((count) % 100 == 0){ m->mothurOut("Processing sequence: " + toString(count)); m->mothurOutEndLine(); }
963 if((count) % 100 != 0){ m->mothurOut("Processing sequence: " + toString(count)); m->mothurOutEndLine(); }
968 badAccnosFile.close();
972 catch(exception& e) {
973 m->errorOut(e, "ScreenSeqsCommand", "driver");
977 //**********************************************************************************************************************
979 int ScreenSeqsCommand::driverMPI(int start, int num, MPI_File& inMPI, MPI_File& goodFile, MPI_File& badAccnosFile, vector<unsigned long int>& MPIPos, set<string>& badSeqNames){
981 string outputString = "";
982 MPI_Status statusGood;
983 MPI_Status statusBadAccnos;
986 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
988 for(int i=0;i<num;i++){
990 if (m->control_pressed) { return 0; }
993 int length = MPIPos[start+i+1] - MPIPos[start+i];
995 char* buf4 = new char[length];
996 memcpy(buf4, outputString.c_str(), length);
998 MPI_File_read_at(inMPI, MPIPos[start+i], buf4, length, MPI_CHAR, &status);
1000 string tempBuf = buf4; delete buf4;
1001 if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length); }
1002 istringstream iss (tempBuf,istringstream::in);
1004 Sequence currSeq(iss);
1007 if (currSeq.getName() != "") {
1008 bool goodSeq = 1; // innocent until proven guilty
1009 if(goodSeq == 1 && startPos != -1 && startPos < currSeq.getStartPos()) { goodSeq = 0; }
1010 if(goodSeq == 1 && endPos != -1 && endPos > currSeq.getEndPos()) { goodSeq = 0; }
1011 if(goodSeq == 1 && maxAmbig != -1 && maxAmbig < currSeq.getAmbigBases()) { goodSeq = 0; }
1012 if(goodSeq == 1 && maxHomoP != -1 && maxHomoP < currSeq.getLongHomoPolymer()) { goodSeq = 0; }
1013 if(goodSeq == 1 && minLength != -1 && minLength > currSeq.getNumBases()) { goodSeq = 0; }
1014 if(goodSeq == 1 && maxLength != -1 && maxLength < currSeq.getNumBases()) { goodSeq = 0; }
1017 outputString = ">" + currSeq.getName() + "\n" + currSeq.getAligned() + "\n";
1020 length = outputString.length();
1021 char* buf2 = new char[length];
1022 memcpy(buf2, outputString.c_str(), length);
1024 MPI_File_write_shared(goodFile, buf2, length, MPI_CHAR, &statusGood);
1029 badSeqNames.insert(currSeq.getName());
1031 //write to bad accnos file
1032 outputString = currSeq.getName() + "\n";
1034 length = outputString.length();
1035 char* buf3 = new char[length];
1036 memcpy(buf3, outputString.c_str(), length);
1038 MPI_File_write_shared(badAccnosFile, buf3, length, MPI_CHAR, &statusBadAccnos);
1044 if((i) % 100 == 0){ m->mothurOut("Processing sequence: " + toString(i)); m->mothurOutEndLine(); }
1049 catch(exception& e) {
1050 m->errorOut(e, "ScreenSeqsCommand", "driverMPI");
1055 /**************************************************************************************************/
1057 int ScreenSeqsCommand::createProcesses(string goodFileName, string badAccnos, string filename, set<string>& badSeqNames) {
1059 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
1063 //loop through and create all the processes you want
1064 while (process != processors) {
1068 processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later
1070 }else if (pid == 0){
1071 num = driver(lines[process], goodFileName + toString(getpid()) + ".temp", badAccnos + toString(getpid()) + ".temp", filename, badSeqNames);
1073 //pass numSeqs to parent
1075 string tempFile = filename + toString(getpid()) + ".num.temp";
1076 m->openOutputFile(tempFile, out);
1082 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine();
1083 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
1088 //force parent to wait until all the processes are done
1089 for (int i=0;i<processors;i++) {
1090 int temp = processIDS[i];
1094 for (int i = 0; i < processIDS.size(); i++) {
1096 string tempFile = filename + toString(processIDS[i]) + ".num.temp";
1097 m->openInputFile(tempFile, in);
1098 if (!in.eof()) { int tempNum = 0; in >> tempNum; num += tempNum; }
1099 in.close(); remove(tempFile.c_str());
1105 catch(exception& e) {
1106 m->errorOut(e, "ScreenSeqsCommand", "createProcesses");
1111 //***************************************************************************************************************