2 * screenseqscommand.cpp
5 * Created by Pat Schloss on 6/3/09.
6 * Copyright 2009 Patrick D. Schloss. All rights reserved.
10 #include "screenseqscommand.h"
11 #include "sequence.hpp"
13 //**********************************************************************************************************************
14 vector<string> ScreenSeqsCommand::setParameters(){
16 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta);
17 CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname);
18 CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pgroup);
19 CommandParameter pqfile("qfile", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pqfile);
20 CommandParameter palignreport("alignreport", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(palignreport);
21 CommandParameter pstart("start", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pstart);
22 CommandParameter pend("end", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pend);
23 CommandParameter pmaxambig("maxambig", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pmaxambig);
24 CommandParameter pmaxhomop("maxhomop", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pmaxhomop);
25 CommandParameter pminlength("minlength", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pminlength);
26 CommandParameter pmaxlength("maxlength", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pmaxlength);
27 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
28 CommandParameter pcriteria("criteria", "Number", "", "90", "", "", "",false,false); parameters.push_back(pcriteria);
29 CommandParameter poptimize("optimize", "Multiple", "none-start-end-maxambig-maxhomop-minlength-maxlength", "none", "", "", "",true,false); parameters.push_back(poptimize);
30 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
31 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
33 vector<string> myArray;
34 for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); }
38 m->errorOut(e, "ScreenSeqsCommand", "setParameters");
42 //**********************************************************************************************************************
43 string ScreenSeqsCommand::getHelpString(){
45 string helpString = "";
46 helpString += "The screen.seqs command reads a fastafile and creates .....\n";
47 helpString += "The screen.seqs command parameters are fasta, start, end, maxambig, maxhomop, minlength, maxlength, name, group, qfile, optimize, criteria and processors.\n";
48 helpString += "The fasta parameter is required.\n";
49 helpString += "The start parameter .... The default is -1.\n";
50 helpString += "The end parameter .... The default is -1.\n";
51 helpString += "The maxambig parameter allows you to set the maximum number of ambigious bases allowed. The default is -1.\n";
52 helpString += "The maxhomop parameter allows you to set a maximum homopolymer length. \n";
53 helpString += "The minlength parameter allows you to set and minimum sequence length. \n";
54 helpString += "The maxlength parameter allows you to set and maximum sequence length. \n";
55 helpString += "The processors parameter allows you to specify the number of processors to use while running the command. The default is 1.\n";
56 helpString += "The optimize and criteria parameters allow you set the start, end, maxabig, maxhomop, minlength and maxlength parameters relative to your set of sequences .\n";
57 helpString += "For example optimize=start-end, criteria=90, would set the start and end values to the position 90% of your sequences started and ended.\n";
58 helpString += "The name parameter allows you to provide a namesfile, and the group parameter allows you to provide a groupfile.\n";
59 helpString += "The screen.seqs command should be in the following format: \n";
60 helpString += "screen.seqs(fasta=yourFastaFile, name=youNameFile, group=yourGroupFIle, start=yourStart, end=yourEnd, maxambig=yourMaxambig, \n";
61 helpString += "maxhomop=yourMaxhomop, minlength=youMinlength, maxlength=yourMaxlength) \n";
62 helpString += "Example screen.seqs(fasta=abrecovery.fasta, name=abrecovery.names, group=abrecovery.groups, start=..., end=..., maxambig=..., maxhomop=..., minlength=..., maxlength=...).\n";
63 helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n";
67 m->errorOut(e, "ScreenSeqsCommand", "getHelpString");
71 //**********************************************************************************************************************
72 ScreenSeqsCommand::ScreenSeqsCommand(){
74 abort = true; calledHelp = true;
76 vector<string> tempOutNames;
77 outputTypes["fasta"] = tempOutNames;
78 outputTypes["name"] = tempOutNames;
79 outputTypes["group"] = tempOutNames;
80 outputTypes["alignreport"] = tempOutNames;
81 outputTypes["accnos"] = tempOutNames;
82 outputTypes["qfile"] = tempOutNames;
85 m->errorOut(e, "ScreenSeqsCommand", "ScreenSeqsCommand");
89 //***************************************************************************************************************
91 ScreenSeqsCommand::ScreenSeqsCommand(string option) {
93 abort = false; calledHelp = false;
95 //allow user to run help
96 if(option == "help") { help(); abort = true; calledHelp = true; }
97 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
100 vector<string> myArray = setParameters();
102 OptionParser parser(option);
103 map<string,string> parameters = parser.getParameters();
105 ValidParameters validParameter("screen.seqs");
106 map<string,string>::iterator it;
108 //check to make sure all parameters are valid for command
109 for (it = parameters.begin(); it != parameters.end(); it++) {
110 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
113 //initialize outputTypes
114 vector<string> tempOutNames;
115 outputTypes["fasta"] = tempOutNames;
116 outputTypes["name"] = tempOutNames;
117 outputTypes["group"] = tempOutNames;
118 outputTypes["alignreport"] = tempOutNames;
119 outputTypes["accnos"] = tempOutNames;
120 outputTypes["qfile"] = tempOutNames;
122 //if the user changes the input directory command factory will send this info to us in the output parameter
123 string inputDir = validParameter.validFile(parameters, "inputdir", false);
124 if (inputDir == "not found"){ inputDir = ""; }
127 it = parameters.find("fasta");
128 //user has given a template file
129 if(it != parameters.end()){
130 path = m->hasPath(it->second);
131 //if the user has not given a path then, add inputdir. else leave path alone.
132 if (path == "") { parameters["fasta"] = inputDir + it->second; }
135 it = parameters.find("group");
136 //user has given a template file
137 if(it != parameters.end()){
138 path = m->hasPath(it->second);
139 //if the user has not given a path then, add inputdir. else leave path alone.
140 if (path == "") { parameters["group"] = inputDir + it->second; }
143 it = parameters.find("name");
144 //user has given a template file
145 if(it != parameters.end()){
146 path = m->hasPath(it->second);
147 //if the user has not given a path then, add inputdir. else leave path alone.
148 if (path == "") { parameters["name"] = inputDir + it->second; }
151 it = parameters.find("alignreport");
152 //user has given a template file
153 if(it != parameters.end()){
154 path = m->hasPath(it->second);
155 //if the user has not given a path then, add inputdir. else leave path alone.
156 if (path == "") { parameters["alignreport"] = inputDir + it->second; }
159 it = parameters.find("qfile");
160 //user has given a template file
161 if(it != parameters.end()){
162 path = m->hasPath(it->second);
163 //if the user has not given a path then, add inputdir. else leave path alone.
164 if (path == "") { parameters["qfile"] = inputDir + it->second; }
169 //check for required parameters
170 fastafile = validParameter.validFile(parameters, "fasta", true);
171 if (fastafile == "not found") {
172 fastafile = m->getFastaFile();
173 if (fastafile != "") { m->mothurOut("Using " + fastafile + " as input file for the fasta parameter."); m->mothurOutEndLine(); }
174 else { m->mothurOut("You have no current fastafile and the fasta parameter is required."); m->mothurOutEndLine(); abort = true; }
176 else if (fastafile == "not open") { abort = true; }
178 groupfile = validParameter.validFile(parameters, "group", true);
179 if (groupfile == "not open") { abort = true; }
180 else if (groupfile == "not found") { groupfile = ""; }
182 qualfile = validParameter.validFile(parameters, "qfile", true);
183 if (qualfile == "not open") { abort = true; }
184 else if (qualfile == "not found") { qualfile = ""; }
186 namefile = validParameter.validFile(parameters, "name", true);
187 if (namefile == "not open") { namefile = ""; abort = true; }
188 else if (namefile == "not found") { namefile = ""; }
190 alignreport = validParameter.validFile(parameters, "alignreport", true);
191 if (alignreport == "not open") { abort = true; }
192 else if (alignreport == "not found") { alignreport = ""; }
194 //if the user changes the output directory command factory will send this info to us in the output parameter
195 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){
197 outputDir += m->hasPath(fastafile); //if user entered a file with a path then preserve it
200 //check for optional parameter and set defaults
201 // ...at some point should added some additional type checking...
203 temp = validParameter.validFile(parameters, "start", false); if (temp == "not found") { temp = "-1"; }
204 convert(temp, startPos);
206 temp = validParameter.validFile(parameters, "end", false); if (temp == "not found") { temp = "-1"; }
207 convert(temp, endPos);
209 temp = validParameter.validFile(parameters, "maxambig", false); if (temp == "not found") { temp = "-1"; }
210 convert(temp, maxAmbig);
212 temp = validParameter.validFile(parameters, "maxhomop", false); if (temp == "not found") { temp = "-1"; }
213 convert(temp, maxHomoP);
215 temp = validParameter.validFile(parameters, "minlength", false); if (temp == "not found") { temp = "-1"; }
216 convert(temp, minLength);
218 temp = validParameter.validFile(parameters, "maxlength", false); if (temp == "not found") { temp = "-1"; }
219 convert(temp, maxLength);
221 temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); }
222 m->setProcessors(temp);
223 convert(temp, processors);
225 temp = validParameter.validFile(parameters, "optimize", false); //optimizing trumps the optimized values original value
226 if (temp == "not found"){ temp = "none"; }
227 m->splitAtDash(temp, optimize);
229 //check for invalid optimize options
230 set<string> validOptimizers;
231 validOptimizers.insert("none"); validOptimizers.insert("start"); validOptimizers.insert("end"); validOptimizers.insert("maxambig"); validOptimizers.insert("maxhomop"); validOptimizers.insert("minlength"); validOptimizers.insert("maxlength");
232 for (int i = 0; i < optimize.size(); i++) {
233 if (validOptimizers.count(optimize[i]) == 0) {
234 m->mothurOut(optimize[i] + " is not a valid optimizer. Valid options are start, end, maxambig, maxhomop, minlength and maxlength."); m->mothurOutEndLine();
235 optimize.erase(optimize.begin()+i);
240 if (optimize.size() == 1) { if (optimize[0] == "none") { optimize.clear(); } }
242 temp = validParameter.validFile(parameters, "criteria", false); if (temp == "not found"){ temp = "90"; }
243 convert(temp, criteria);
247 catch(exception& e) {
248 m->errorOut(e, "ScreenSeqsCommand", "ScreenSeqsCommand");
252 //***************************************************************************************************************
254 int ScreenSeqsCommand::execute(){
257 if (abort == true) { if (calledHelp) { return 0; } return 2; }
259 //if the user want to optimize we need to know the 90% mark
260 vector<unsigned long int> positions;
261 if (optimize.size() != 0) { //get summary is paralellized so we need to divideFile, no need to do this step twice so I moved it here
262 //use the namefile to optimize correctly
263 if (namefile != "") { nameMap = m->readNames(namefile); }
264 getSummary(positions);
267 positions = m->divideFile(fastafile, processors);
268 for (int i = 0; i < (positions.size()-1); i++) {
269 lines.push_back(new linePair(positions[i], positions[(i+1)]));
273 string goodSeqFile = outputDir + m->getRootName(m->getSimpleName(fastafile)) + "good" + m->getExtension(fastafile);
274 string badAccnosFile = outputDir + m->getRootName(m->getSimpleName(fastafile)) + "bad.accnos";
276 int numFastaSeqs = 0;
277 set<string> badSeqNames;
278 int start = time(NULL);
281 int pid, numSeqsPerProcessor;
283 vector<unsigned long int> MPIPos;
286 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
287 MPI_Comm_size(MPI_COMM_WORLD, &processors);
291 MPI_File outMPIBadAccnos;
293 int outMode=MPI_MODE_CREATE|MPI_MODE_WRONLY;
294 int inMode=MPI_MODE_RDONLY;
296 char outGoodFilename[1024];
297 strcpy(outGoodFilename, goodSeqFile.c_str());
299 char outBadAccnosFilename[1024];
300 strcpy(outBadAccnosFilename, badAccnosFile.c_str());
302 char inFileName[1024];
303 strcpy(inFileName, fastafile.c_str());
305 MPI_File_open(MPI_COMM_WORLD, inFileName, inMode, MPI_INFO_NULL, &inMPI); //comm, filename, mode, info, filepointer
306 MPI_File_open(MPI_COMM_WORLD, outGoodFilename, outMode, MPI_INFO_NULL, &outMPIGood);
307 MPI_File_open(MPI_COMM_WORLD, outBadAccnosFilename, outMode, MPI_INFO_NULL, &outMPIBadAccnos);
309 if (m->control_pressed) { MPI_File_close(&inMPI); MPI_File_close(&outMPIGood); MPI_File_close(&outMPIBadAccnos); return 0; }
311 if (pid == 0) { //you are the root process
313 MPIPos = m->setFilePosFasta(fastafile, numFastaSeqs); //fills MPIPos, returns numSeqs
315 //send file positions to all processes
316 for(int i = 1; i < processors; i++) {
317 MPI_Send(&numFastaSeqs, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
318 MPI_Send(&MPIPos[0], (numFastaSeqs+1), MPI_LONG, i, tag, MPI_COMM_WORLD);
321 //figure out how many sequences you have to align
322 numSeqsPerProcessor = numFastaSeqs / processors;
323 int startIndex = pid * numSeqsPerProcessor;
324 if(pid == (processors - 1)){ numSeqsPerProcessor = numFastaSeqs - pid * numSeqsPerProcessor; }
325 // cout << pid << '\t' << numSeqsPerProcessor << '\t' << startIndex << endl;
327 driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPIGood, outMPIBadAccnos, MPIPos, badSeqNames);
328 //cout << pid << " done" << endl;
329 if (m->control_pressed) { MPI_File_close(&inMPI); MPI_File_close(&outMPIGood); MPI_File_close(&outMPIBadAccnos); return 0; }
331 for (int i = 1; i < processors; i++) {
335 MPI_Recv(&badSize, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
336 /*for (int j = 0; j < badSize; j++) {
338 MPI_Recv(&length, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status); //recv the length of the name
339 char* buf2 = new char[length]; //make space to recieve it
340 MPI_Recv(buf2, length, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status); //get name
342 string tempBuf = buf2;
343 if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length); }
346 badSeqNames.insert(tempBuf);
349 }else{ //you are a child process
350 MPI_Recv(&numFastaSeqs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
351 MPIPos.resize(numFastaSeqs+1);
352 MPI_Recv(&MPIPos[0], (numFastaSeqs+1), MPI_LONG, 0, tag, MPI_COMM_WORLD, &status);
354 //figure out how many sequences you have to align
355 numSeqsPerProcessor = numFastaSeqs / processors;
356 int startIndex = pid * numSeqsPerProcessor;
357 if(pid == (processors - 1)){ numSeqsPerProcessor = numFastaSeqs - pid * numSeqsPerProcessor; }
358 //cout << pid << '\t' << numSeqsPerProcessor << '\t' << startIndex << endl;
360 driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPIGood, outMPIBadAccnos, MPIPos, badSeqNames);
361 //cout << pid << " done" << endl;
362 if (m->control_pressed) { MPI_File_close(&inMPI); MPI_File_close(&outMPIGood); MPI_File_close(&outMPIBadAccnos); return 0; }
365 int badSize = badSeqNames.size();
366 MPI_Send(&badSize, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
369 set<string>::iterator it;
370 for (it = badSeqNames.begin(); it != badSeqNames.end(); it++) {
372 int length = name.length();
373 char* buf2 = new char[length];
374 memcpy(buf2, name.c_str(), length);
376 MPI_Send(&length, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
377 MPI_Send(buf2, length, MPI_CHAR, 0, tag, MPI_COMM_WORLD);
382 MPI_File_close(&inMPI);
383 MPI_File_close(&outMPIGood);
384 MPI_File_close(&outMPIBadAccnos);
385 MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case
389 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
391 numFastaSeqs = driver(lines[0], goodSeqFile, badAccnosFile, fastafile, badSeqNames);
393 if (m->control_pressed) { remove(goodSeqFile.c_str()); return 0; }
396 processIDS.resize(0);
398 numFastaSeqs = createProcesses(goodSeqFile, badAccnosFile, fastafile, badSeqNames);
400 rename((goodSeqFile + toString(processIDS[0]) + ".temp").c_str(), goodSeqFile.c_str());
401 rename((badAccnosFile + toString(processIDS[0]) + ".temp").c_str(), badAccnosFile.c_str());
403 //append alignment and report files
404 for(int i=1;i<processors;i++){
405 m->appendFiles((goodSeqFile + toString(processIDS[i]) + ".temp"), goodSeqFile);
406 remove((goodSeqFile + toString(processIDS[i]) + ".temp").c_str());
408 m->appendFiles((badAccnosFile + toString(processIDS[i]) + ".temp"), badAccnosFile);
409 remove((badAccnosFile + toString(processIDS[i]) + ".temp").c_str());
412 if (m->control_pressed) { remove(goodSeqFile.c_str()); return 0; }
414 //read badSeqs in because root process doesnt know what other "bad" seqs the children found
416 int ableToOpen = m->openInputFile(badAccnosFile, inBad, "no error");
418 if (ableToOpen == 0) {
421 while (!inBad.eof()) {
422 inBad >> tempName; m->gobble(inBad);
423 badSeqNames.insert(tempName);
429 numFastaSeqs = driver(lines[0], goodSeqFile, badAccnosFile, fastafile, badSeqNames);
431 if (m->control_pressed) { remove(goodSeqFile.c_str()); return 0; }
438 MPI_Comm_rank(MPI_COMM_WORLD, &pid);
440 if (pid == 0) { //only one process should fix files
442 //read accnos file with all names in it, process 0 just has its names
443 MPI_File inMPIAccnos;
446 char inFileName[1024];
447 strcpy(inFileName, badAccnosFile.c_str());
449 MPI_File_open(MPI_COMM_SELF, inFileName, inMode, MPI_INFO_NULL, &inMPIAccnos); //comm, filename, mode, info, filepointer
450 MPI_File_get_size(inMPIAccnos, &size);
452 char* buffer = new char[size];
453 MPI_File_read(inMPIAccnos, buffer, size, MPI_CHAR, &status);
455 string tempBuf = buffer;
456 if (tempBuf.length() > size) { tempBuf = tempBuf.substr(0, size); }
457 istringstream iss (tempBuf,istringstream::in);
460 MPI_File_close(&inMPIAccnos);
465 iss >> tempName; m->gobble(iss);
466 badSeqNames.insert(tempName);
470 if(namefile != "" && groupfile != "") {
471 screenNameGroupFile(badSeqNames);
472 if (m->control_pressed) { remove(goodSeqFile.c_str()); return 0; }
473 }else if(namefile != "") {
474 screenNameGroupFile(badSeqNames);
475 if (m->control_pressed) { remove(goodSeqFile.c_str()); return 0; }
476 }else if(groupfile != "") { screenGroupFile(badSeqNames); } // this screens just the group
478 if (m->control_pressed) { remove(goodSeqFile.c_str()); return 0; }
480 if(alignreport != "") { screenAlignReport(badSeqNames); }
481 if(qualfile != "") { screenQual(badSeqNames); }
483 if (m->control_pressed) { remove(goodSeqFile.c_str()); return 0; }
489 m->mothurOutEndLine();
490 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
491 m->mothurOut(goodSeqFile); m->mothurOutEndLine(); outputTypes["fasta"].push_back(goodSeqFile);
492 m->mothurOut(badAccnosFile); m->mothurOutEndLine(); outputTypes["accnos"].push_back(badAccnosFile);
493 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
494 m->mothurOutEndLine();
495 m->mothurOutEndLine();
497 //set fasta file as new current fastafile
499 itTypes = outputTypes.find("fasta");
500 if (itTypes != outputTypes.end()) {
501 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFastaFile(current); }
504 itTypes = outputTypes.find("name");
505 if (itTypes != outputTypes.end()) {
506 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setNameFile(current); }
509 itTypes = outputTypes.find("group");
510 if (itTypes != outputTypes.end()) {
511 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setGroupFile(current); }
514 itTypes = outputTypes.find("qfile");
515 if (itTypes != outputTypes.end()) {
516 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setQualFile(current); }
519 m->mothurOut("It took " + toString(time(NULL) - start) + " secs to screen " + toString(numFastaSeqs) + " sequences.");
520 m->mothurOutEndLine();
524 catch(exception& e) {
525 m->errorOut(e, "ScreenSeqsCommand", "execute");
530 //***************************************************************************************************************
532 int ScreenSeqsCommand::screenNameGroupFile(set<string> badSeqNames){
535 m->openInputFile(namefile, inputNames);
536 set<string> badSeqGroups;
537 string seqName, seqList, group;
538 set<string>::iterator it;
540 string goodNameFile = outputDir + m->getRootName(m->getSimpleName(namefile)) + "good" + m->getExtension(namefile);
541 outputNames.push_back(goodNameFile); outputTypes["name"].push_back(goodNameFile);
543 ofstream goodNameOut; m->openOutputFile(goodNameFile, goodNameOut);
545 while(!inputNames.eof()){
546 if (m->control_pressed) { goodNameOut.close(); inputNames.close(); remove(goodNameFile.c_str()); return 0; }
548 inputNames >> seqName >> seqList;
549 it = badSeqNames.find(seqName);
551 if(it != badSeqNames.end()){
552 badSeqNames.erase(it);
556 for(int i=0;i<seqList.length();i++){
557 if(seqList[i] == ','){
558 badSeqGroups.insert(seqList.substr(start,i-start));
562 badSeqGroups.insert(seqList.substr(start,seqList.length()-start));
566 goodNameOut << seqName << '\t' << seqList << endl;
568 m->gobble(inputNames);
573 //we were unable to remove some of the bad sequences
574 if (badSeqNames.size() != 0) {
575 for (it = badSeqNames.begin(); it != badSeqNames.end(); it++) {
576 m->mothurOut("Your namefile does not include the sequence " + *it + " please correct.");
577 m->mothurOutEndLine();
583 ifstream inputGroups;
584 m->openInputFile(groupfile, inputGroups);
586 string goodGroupFile = outputDir + m->getRootName(m->getSimpleName(groupfile)) + "good" + m->getExtension(groupfile);
587 outputNames.push_back(goodGroupFile); outputTypes["group"].push_back(goodGroupFile);
589 ofstream goodGroupOut; m->openOutputFile(goodGroupFile, goodGroupOut);
591 while(!inputGroups.eof()){
592 if (m->control_pressed) { goodGroupOut.close(); inputGroups.close(); remove(goodNameFile.c_str()); remove(goodGroupFile.c_str()); return 0; }
594 inputGroups >> seqName >> group;
596 it = badSeqGroups.find(seqName);
598 if(it != badSeqGroups.end()){
599 badSeqGroups.erase(it);
602 goodGroupOut << seqName << '\t' << group << endl;
604 m->gobble(inputGroups);
607 goodGroupOut.close();
609 //we were unable to remove some of the bad sequences
610 if (badSeqGroups.size() != 0) {
611 for (it = badSeqGroups.begin(); it != badSeqGroups.end(); it++) {
612 m->mothurOut("Your groupfile does not include the sequence " + *it + " please correct.");
613 m->mothurOutEndLine();
622 catch(exception& e) {
623 m->errorOut(e, "ScreenSeqsCommand", "screenNameGroupFile");
627 //***************************************************************************************************************
628 int ScreenSeqsCommand::getSummary(vector<unsigned long int>& positions){
631 vector<int> startPosition;
632 vector<int> endPosition;
633 vector<int> seqLength;
634 vector<int> ambigBases;
635 vector<int> longHomoPolymer;
637 vector<unsigned long int> positions = m->divideFile(fastafile, processors);
639 for (int i = 0; i < (positions.size()-1); i++) {
640 lines.push_back(new linePair(positions[i], positions[(i+1)]));
644 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
646 numSeqs = driverCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, fastafile, lines[0]);
648 numSeqs = createProcessesCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, fastafile);
651 if (m->control_pressed) { return 0; }
653 numSeqs = driverCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, fastafile, lines[0]);
654 if (m->control_pressed) { return 0; }
657 sort(startPosition.begin(), startPosition.end());
658 sort(endPosition.begin(), endPosition.end());
659 sort(seqLength.begin(), seqLength.end());
660 sort(ambigBases.begin(), ambigBases.end());
661 sort(longHomoPolymer.begin(), longHomoPolymer.end());
663 //numSeqs is the number of unique seqs, startPosition.size() is the total number of seqs, we want to optimize using all seqs
664 int criteriaPercentile = int(startPosition.size() * (criteria / (float) 100));
666 for (int i = 0; i < optimize.size(); i++) {
667 if (optimize[i] == "start") { startPos = startPosition[criteriaPercentile]; m->mothurOut("Optimizing start to " + toString(startPos) + "."); m->mothurOutEndLine(); }
668 else if (optimize[i] == "end") { int endcriteriaPercentile = int(endPosition.size() * ((100 - criteria) / (float) 100)); endPos = endPosition[endcriteriaPercentile]; m->mothurOut("Optimizing end to " + toString(endPos) + "."); m->mothurOutEndLine();}
669 else if (optimize[i] == "maxambig") { maxAmbig = ambigBases[criteriaPercentile]; m->mothurOut("Optimizing maxambig to " + toString(maxAmbig) + "."); m->mothurOutEndLine(); }
670 else if (optimize[i] == "maxhomop") { maxHomoP = longHomoPolymer[criteriaPercentile]; m->mothurOut("Optimizing maxhomop to " + toString(maxHomoP) + "."); m->mothurOutEndLine(); }
671 else if (optimize[i] == "minlength") { int mincriteriaPercentile = int(seqLength.size() * ((100 - criteria) / (float) 100)); minLength = seqLength[mincriteriaPercentile]; m->mothurOut("Optimizing minlength to " + toString(minLength) + "."); m->mothurOutEndLine(); }
672 else if (optimize[i] == "maxlength") { maxLength = seqLength[criteriaPercentile]; m->mothurOut("Optimizing maxlength to " + toString(maxLength) + "."); m->mothurOutEndLine(); }
677 catch(exception& e) {
678 m->errorOut(e, "ScreenSeqsCommand", "getSummary");
682 /**************************************************************************************/
683 int ScreenSeqsCommand::driverCreateSummary(vector<int>& startPosition, vector<int>& endPosition, vector<int>& seqLength, vector<int>& ambigBases, vector<int>& longHomoPolymer, string filename, linePair* filePos) {
687 m->openInputFile(filename, in);
689 in.seekg(filePos->start);
696 if (m->control_pressed) { in.close(); return 1; }
698 Sequence current(in); m->gobble(in);
700 if (current.getName() != "") {
702 if (namefile != "") {
703 //make sure this sequence is in the namefile, else error
704 map<string, int>::iterator it = nameMap.find(current.getName());
706 if (it == nameMap.end()) { m->mothurOut("[ERROR]: " + current.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); m->control_pressed = true; }
707 else { num = it->second; }
710 //for each sequence this sequence represents
711 for (int i = 0; i < num; i++) {
712 startPosition.push_back(current.getStartPos());
713 endPosition.push_back(current.getEndPos());
714 seqLength.push_back(current.getNumBases());
715 ambigBases.push_back(current.getAmbigBases());
716 longHomoPolymer.push_back(current.getLongHomoPolymer());
722 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
723 unsigned long int pos = in.tellg();
724 if ((pos == -1) || (pos >= filePos->end)) { break; }
726 if (in.eof()) { break; }
735 catch(exception& e) {
736 m->errorOut(e, "ScreenSeqsCommand", "driverCreateSummary");
740 /**************************************************************************************************/
741 int ScreenSeqsCommand::createProcessesCreateSummary(vector<int>& startPosition, vector<int>& endPosition, vector<int>& seqLength, vector<int>& ambigBases, vector<int>& longHomoPolymer, string filename) {
743 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
748 //loop through and create all the processes you want
749 while (process != processors) {
753 processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later
756 num = driverCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, fastafile, lines[process]);
758 //pass numSeqs to parent
760 string tempFile = fastafile + toString(getpid()) + ".num.temp";
761 m->openOutputFile(tempFile, out);
764 out << startPosition.size() << endl;
765 for (int k = 0; k < startPosition.size(); k++) { out << startPosition[k] << '\t'; } out << endl;
766 for (int k = 0; k < endPosition.size(); k++) { out << endPosition[k] << '\t'; } out << endl;
767 for (int k = 0; k < seqLength.size(); k++) { out << seqLength[k] << '\t'; } out << endl;
768 for (int k = 0; k < ambigBases.size(); k++) { out << ambigBases[k] << '\t'; } out << endl;
769 for (int k = 0; k < longHomoPolymer.size(); k++) { out << longHomoPolymer[k] << '\t'; } out << endl;
775 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine();
776 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
781 num = driverCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, fastafile, lines[0]);
783 //force parent to wait until all the processes are done
784 for (int i=0;i<processIDS.size();i++) {
785 int temp = processIDS[i];
789 //parent reads in and combine Filter info
790 for (int i = 0; i < processIDS.size(); i++) {
791 string tempFilename = fastafile + toString(processIDS[i]) + ".num.temp";
793 m->openInputFile(tempFilename, in);
796 in >> tempNum; m->gobble(in); num += tempNum;
797 in >> tempNum; m->gobble(in);
798 for (int k = 0; k < tempNum; k++) { in >> temp; startPosition.push_back(temp); } m->gobble(in);
799 for (int k = 0; k < tempNum; k++) { in >> temp; endPosition.push_back(temp); } m->gobble(in);
800 for (int k = 0; k < tempNum; k++) { in >> temp; seqLength.push_back(temp); } m->gobble(in);
801 for (int k = 0; k < tempNum; k++) { in >> temp; ambigBases.push_back(temp); } m->gobble(in);
802 for (int k = 0; k < tempNum; k++) { in >> temp; longHomoPolymer.push_back(temp); } m->gobble(in);
805 remove(tempFilename.c_str());
811 catch(exception& e) {
812 m->errorOut(e, "ScreenSeqsCommand", "createProcessesCreateSummary");
817 //***************************************************************************************************************
819 int ScreenSeqsCommand::screenGroupFile(set<string> badSeqNames){
821 ifstream inputGroups;
822 m->openInputFile(groupfile, inputGroups);
823 string seqName, group;
824 set<string>::iterator it;
826 string goodGroupFile = outputDir + m->getRootName(m->getSimpleName(groupfile)) + "good" + m->getExtension(groupfile);
827 outputNames.push_back(goodGroupFile); outputTypes["group"].push_back(goodGroupFile);
828 ofstream goodGroupOut; m->openOutputFile(goodGroupFile, goodGroupOut);
830 while(!inputGroups.eof()){
831 if (m->control_pressed) { goodGroupOut.close(); inputGroups.close(); remove(goodGroupFile.c_str()); return 0; }
833 inputGroups >> seqName >> group;
834 it = badSeqNames.find(seqName);
836 if(it != badSeqNames.end()){
837 badSeqNames.erase(it);
840 goodGroupOut << seqName << '\t' << group << endl;
842 m->gobble(inputGroups);
845 if (m->control_pressed) { goodGroupOut.close(); inputGroups.close(); remove(goodGroupFile.c_str()); return 0; }
847 //we were unable to remove some of the bad sequences
848 if (badSeqNames.size() != 0) {
849 for (it = badSeqNames.begin(); it != badSeqNames.end(); it++) {
850 m->mothurOut("Your groupfile does not include the sequence " + *it + " please correct.");
851 m->mothurOutEndLine();
856 goodGroupOut.close();
858 if (m->control_pressed) { remove(goodGroupFile.c_str()); }
863 catch(exception& e) {
864 m->errorOut(e, "ScreenSeqsCommand", "screenGroupFile");
869 //***************************************************************************************************************
871 int ScreenSeqsCommand::screenAlignReport(set<string> badSeqNames){
873 ifstream inputAlignReport;
874 m->openInputFile(alignreport, inputAlignReport);
875 string seqName, group;
876 set<string>::iterator it;
878 string goodAlignReportFile = outputDir + m->getRootName(m->getSimpleName(alignreport)) + "good" + m->getExtension(alignreport);
879 outputNames.push_back(goodAlignReportFile); outputTypes["alignreport"].push_back(goodAlignReportFile);
880 ofstream goodAlignReportOut; m->openOutputFile(goodAlignReportFile, goodAlignReportOut);
882 while (!inputAlignReport.eof()) { // need to copy header
883 char c = inputAlignReport.get();
884 goodAlignReportOut << c;
885 if (c == 10 || c == 13){ break; }
888 while(!inputAlignReport.eof()){
889 if (m->control_pressed) { goodAlignReportOut.close(); inputAlignReport.close(); remove(goodAlignReportFile.c_str()); return 0; }
891 inputAlignReport >> seqName;
892 it = badSeqNames.find(seqName);
894 while (!inputAlignReport.eof()) { // need to copy header
895 char c = inputAlignReport.get();
897 if (c == 10 || c == 13){ break; }
900 if(it != badSeqNames.end()){
901 badSeqNames.erase(it);
904 goodAlignReportOut << seqName << '\t' << line;
906 m->gobble(inputAlignReport);
909 if (m->control_pressed) { goodAlignReportOut.close(); inputAlignReport.close(); remove(goodAlignReportFile.c_str()); return 0; }
911 //we were unable to remove some of the bad sequences
912 if (badSeqNames.size() != 0) {
913 for (it = badSeqNames.begin(); it != badSeqNames.end(); it++) {
914 m->mothurOut("Your alignreport file does not include the sequence " + *it + " please correct.");
915 m->mothurOutEndLine();
919 inputAlignReport.close();
920 goodAlignReportOut.close();
922 if (m->control_pressed) { remove(goodAlignReportFile.c_str()); return 0; }
927 catch(exception& e) {
928 m->errorOut(e, "ScreenSeqsCommand", "screenAlignReport");
933 //***************************************************************************************************************
935 int ScreenSeqsCommand::screenQual(set<string> badSeqNames){
938 m->openInputFile(qualfile, in);
939 set<string>::iterator it;
941 string goodQualFile = outputDir + m->getRootName(m->getSimpleName(qualfile)) + "good" + m->getExtension(qualfile);
942 outputNames.push_back(goodQualFile); outputTypes["qfile"].push_back(goodQualFile);
943 ofstream goodQual; m->openOutputFile(goodQualFile, goodQual);
947 if (m->control_pressed) { goodQual.close(); in.close(); remove(goodQualFile.c_str()); return 0; }
949 string saveName = "";
955 if (name.length() != 0) {
956 saveName = name.substr(1);
959 if (c == 10 || c == 13){ break; }
966 char letter= in.get();
967 if(letter == '>'){ in.putback(letter); break; }
968 else{ scores += letter; }
973 it = badSeqNames.find(saveName);
975 if(it != badSeqNames.end()){
976 badSeqNames.erase(it);
978 goodQual << name << endl << scores;
987 //we were unable to remove some of the bad sequences
988 if (badSeqNames.size() != 0) {
989 for (it = badSeqNames.begin(); it != badSeqNames.end(); it++) {
990 m->mothurOut("Your qual file does not include the sequence " + *it + " please correct.");
991 m->mothurOutEndLine();
995 if (m->control_pressed) { remove(goodQualFile.c_str()); return 0; }
1000 catch(exception& e) {
1001 m->errorOut(e, "ScreenSeqsCommand", "screenQual");
1006 //**********************************************************************************************************************
1008 int ScreenSeqsCommand::driver(linePair* filePos, string goodFName, string badAccnosFName, string filename, set<string>& badSeqNames){
1011 m->openOutputFile(goodFName, goodFile);
1013 ofstream badAccnosFile;
1014 m->openOutputFile(badAccnosFName, badAccnosFile);
1017 m->openInputFile(filename, inFASTA);
1019 inFASTA.seekg(filePos->start);
1026 if (m->control_pressed) { return 0; }
1028 Sequence currSeq(inFASTA); m->gobble(inFASTA);
1029 if (currSeq.getName() != "") {
1030 bool goodSeq = 1; // innocent until proven guilty
1031 if(goodSeq == 1 && startPos != -1 && startPos < currSeq.getStartPos()) { goodSeq = 0; }
1032 if(goodSeq == 1 && endPos != -1 && endPos > currSeq.getEndPos()) { goodSeq = 0; }
1033 if(goodSeq == 1 && maxAmbig != -1 && maxAmbig < currSeq.getAmbigBases()) { goodSeq = 0; }
1034 if(goodSeq == 1 && maxHomoP != -1 && maxHomoP < currSeq.getLongHomoPolymer()) { goodSeq = 0; }
1035 if(goodSeq == 1 && minLength != -1 && minLength > currSeq.getNumBases()) { goodSeq = 0; }
1036 if(goodSeq == 1 && maxLength != -1 && maxLength < currSeq.getNumBases()) { goodSeq = 0; }
1039 currSeq.printSequence(goodFile);
1042 badAccnosFile << currSeq.getName() << endl;
1043 badSeqNames.insert(currSeq.getName());
1048 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
1049 unsigned long int pos = inFASTA.tellg();
1050 if ((pos == -1) || (pos >= filePos->end)) { break; }
1052 if (inFASTA.eof()) { break; }
1056 if((count) % 100 == 0){ m->mothurOut("Processing sequence: " + toString(count)); m->mothurOutEndLine(); }
1059 if((count) % 100 != 0){ m->mothurOut("Processing sequence: " + toString(count)); m->mothurOutEndLine(); }
1064 badAccnosFile.close();
1068 catch(exception& e) {
1069 m->errorOut(e, "ScreenSeqsCommand", "driver");
1073 //**********************************************************************************************************************
1075 int ScreenSeqsCommand::driverMPI(int start, int num, MPI_File& inMPI, MPI_File& goodFile, MPI_File& badAccnosFile, vector<unsigned long int>& MPIPos, set<string>& badSeqNames){
1077 string outputString = "";
1078 MPI_Status statusGood;
1079 MPI_Status statusBadAccnos;
1082 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
1084 for(int i=0;i<num;i++){
1086 if (m->control_pressed) { return 0; }
1088 //read next sequence
1089 int length = MPIPos[start+i+1] - MPIPos[start+i];
1091 char* buf4 = new char[length];
1092 memcpy(buf4, outputString.c_str(), length);
1094 MPI_File_read_at(inMPI, MPIPos[start+i], buf4, length, MPI_CHAR, &status);
1096 string tempBuf = buf4; delete buf4;
1097 if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length); }
1098 istringstream iss (tempBuf,istringstream::in);
1100 Sequence currSeq(iss);
1103 if (currSeq.getName() != "") {
1104 bool goodSeq = 1; // innocent until proven guilty
1105 if(goodSeq == 1 && startPos != -1 && startPos < currSeq.getStartPos()) { goodSeq = 0; }
1106 if(goodSeq == 1 && endPos != -1 && endPos > currSeq.getEndPos()) { goodSeq = 0; }
1107 if(goodSeq == 1 && maxAmbig != -1 && maxAmbig < currSeq.getAmbigBases()) { goodSeq = 0; }
1108 if(goodSeq == 1 && maxHomoP != -1 && maxHomoP < currSeq.getLongHomoPolymer()) { goodSeq = 0; }
1109 if(goodSeq == 1 && minLength != -1 && minLength > currSeq.getNumBases()) { goodSeq = 0; }
1110 if(goodSeq == 1 && maxLength != -1 && maxLength < currSeq.getNumBases()) { goodSeq = 0; }
1113 outputString = ">" + currSeq.getName() + "\n" + currSeq.getAligned() + "\n";
1116 length = outputString.length();
1117 char* buf2 = new char[length];
1118 memcpy(buf2, outputString.c_str(), length);
1120 MPI_File_write_shared(goodFile, buf2, length, MPI_CHAR, &statusGood);
1125 badSeqNames.insert(currSeq.getName());
1127 //write to bad accnos file
1128 outputString = currSeq.getName() + "\n";
1130 length = outputString.length();
1131 char* buf3 = new char[length];
1132 memcpy(buf3, outputString.c_str(), length);
1134 MPI_File_write_shared(badAccnosFile, buf3, length, MPI_CHAR, &statusBadAccnos);
1140 if((i) % 100 == 0){ m->mothurOut("Processing sequence: " + toString(i)); m->mothurOutEndLine(); }
1145 catch(exception& e) {
1146 m->errorOut(e, "ScreenSeqsCommand", "driverMPI");
1151 /**************************************************************************************************/
1153 int ScreenSeqsCommand::createProcesses(string goodFileName, string badAccnos, string filename, set<string>& badSeqNames) {
1155 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
1159 //loop through and create all the processes you want
1160 while (process != processors) {
1164 processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later
1166 }else if (pid == 0){
1167 num = driver(lines[process], goodFileName + toString(getpid()) + ".temp", badAccnos + toString(getpid()) + ".temp", filename, badSeqNames);
1169 //pass numSeqs to parent
1171 string tempFile = filename + toString(getpid()) + ".num.temp";
1172 m->openOutputFile(tempFile, out);
1178 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine();
1179 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
1184 //force parent to wait until all the processes are done
1185 for (int i=0;i<processors;i++) {
1186 int temp = processIDS[i];
1190 for (int i = 0; i < processIDS.size(); i++) {
1192 string tempFile = filename + toString(processIDS[i]) + ".num.temp";
1193 m->openInputFile(tempFile, in);
1194 if (!in.eof()) { int tempNum = 0; in >> tempNum; num += tempNum; }
1195 in.close(); remove(tempFile.c_str());
1201 catch(exception& e) {
1202 m->errorOut(e, "ScreenSeqsCommand", "createProcesses");
1207 //***************************************************************************************************************