]> git.donarmstrong.com Git - mothur.git/blob - screenseqscommand.cpp
added clearcut source to mothur, fixed mislabeled sharedcalcs
[mothur.git] / screenseqscommand.cpp
1 /*
2  *  screenseqscommand.cpp
3  *  Mothur
4  *
5  *  Created by Pat Schloss on 6/3/09.
6  *  Copyright 2009 Patrick D. Schloss. All rights reserved.
7  *
8  */
9
10 #include "screenseqscommand.h"
11 #include "sequence.hpp"
12
13 //***************************************************************************************************************
14
15 ScreenSeqsCommand::ScreenSeqsCommand(string option)  {
16         try {
17                 abort = false;
18                 
19                 //allow user to run help
20                 if(option == "help") { help(); abort = true; }
21                 
22                 else {
23                         //valid paramters for this command
24                         string AlignArray[] =  {"fasta", "start", "end", "maxambig", "maxhomop", "minlength", "maxlength",
25                                                                         "name", "group", "alignreport","processors","outputdir","inputdir"};
26                         vector<string> myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string)));
27                         
28                         OptionParser parser(option);
29                         map<string,string> parameters = parser.getParameters();
30                         
31                         ValidParameters validParameter("screen.seqs");
32                         map<string,string>::iterator it;
33                         
34                         //check to make sure all parameters are valid for command
35                         for (it = parameters.begin(); it != parameters.end(); it++) { 
36                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
37                         }
38                         
39                         //if the user changes the input directory command factory will send this info to us in the output parameter 
40                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
41                         if (inputDir == "not found"){   inputDir = "";          }
42                         else {
43                                 string path;
44                                 it = parameters.find("fasta");
45                                 //user has given a template file
46                                 if(it != parameters.end()){ 
47                                         path = m->hasPath(it->second);
48                                         //if the user has not given a path then, add inputdir. else leave path alone.
49                                         if (path == "") {       parameters["fasta"] = inputDir + it->second;            }
50                                 }
51                                 
52                                 it = parameters.find("group");
53                                 //user has given a template file
54                                 if(it != parameters.end()){ 
55                                         path = m->hasPath(it->second);
56                                         //if the user has not given a path then, add inputdir. else leave path alone.
57                                         if (path == "") {       parameters["group"] = inputDir + it->second;            }
58                                 }
59                                 
60                                 it = parameters.find("name");
61                                 //user has given a template file
62                                 if(it != parameters.end()){ 
63                                         path = m->hasPath(it->second);
64                                         //if the user has not given a path then, add inputdir. else leave path alone.
65                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
66                                 }
67                                 
68                                 it = parameters.find("alignreport");
69                                 //user has given a template file
70                                 if(it != parameters.end()){ 
71                                         path = m->hasPath(it->second);
72                                         //if the user has not given a path then, add inputdir. else leave path alone.
73                                         if (path == "") {       parameters["alignreport"] = inputDir + it->second;              }
74                                 }
75                         }
76
77                         //check for required parameters
78                         fastafile = validParameter.validFile(parameters, "fasta", true);
79                         if (fastafile == "not found") { m->mothurOut("fasta is a required parameter for the screen.seqs command."); m->mothurOutEndLine(); abort = true; }
80                         else if (fastafile == "not open") { abort = true; }     
81         
82                         groupfile = validParameter.validFile(parameters, "group", true);
83                         if (groupfile == "not open") { abort = true; }  
84                         else if (groupfile == "not found") { groupfile = ""; }
85                         
86                         namefile = validParameter.validFile(parameters, "name", true);
87                         if (namefile == "not open") { abort = true; }
88                         else if (namefile == "not found") { namefile = ""; }    
89
90                         alignreport = validParameter.validFile(parameters, "alignreport", true);
91                         if (alignreport == "not open") { abort = true; }
92                         else if (alignreport == "not found") { alignreport = ""; }      
93                         
94                         //if the user changes the output directory command factory will send this info to us in the output parameter 
95                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  
96                                 outputDir = ""; 
97                                 outputDir += m->hasPath(fastafile); //if user entered a file with a path then preserve it       
98                         }
99
100                         //check for optional parameter and set defaults
101                         // ...at some point should added some additional type checking...
102                         string temp;
103                         temp = validParameter.validFile(parameters, "start", false);            if (temp == "not found") { temp = "-1"; }
104                         convert(temp, startPos); 
105                 
106                         temp = validParameter.validFile(parameters, "end", false);                      if (temp == "not found") { temp = "-1"; }
107                         convert(temp, endPos);  
108
109                         temp = validParameter.validFile(parameters, "maxambig", false);         if (temp == "not found") { temp = "-1"; }
110                         convert(temp, maxAmbig);  
111
112                         temp = validParameter.validFile(parameters, "maxhomop", false);         if (temp == "not found") { temp = "-1"; }
113                         convert(temp, maxHomoP);  
114
115                         temp = validParameter.validFile(parameters, "minlength", false);        if (temp == "not found") { temp = "-1"; }
116                         convert(temp, minLength); 
117                         
118                         temp = validParameter.validFile(parameters, "maxlength", false);        if (temp == "not found") { temp = "-1"; }
119                         convert(temp, maxLength); 
120                         
121                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found"){       temp = "1";                             }
122                         convert(temp, processors); 
123
124                 }
125
126         }
127         catch(exception& e) {
128                 m->errorOut(e, "ScreenSeqsCommand", "ScreenSeqsCommand");
129                 exit(1);
130         }
131 }
132 //**********************************************************************************************************************
133
134 void ScreenSeqsCommand::help(){
135         try {
136                 m->mothurOut("The screen.seqs command reads a fastafile and creates .....\n");
137                 m->mothurOut("The screen.seqs command parameters are fasta, start, end, maxambig, maxhomop, minlength, maxlength, name, group and processors.\n");
138                 m->mothurOut("The fasta parameter is required.\n");
139                 m->mothurOut("The start parameter .... The default is -1.\n");
140                 m->mothurOut("The end parameter .... The default is -1.\n");
141                 m->mothurOut("The maxambig parameter .... The default is -1.\n");
142                 m->mothurOut("The maxhomop parameter .... The default is -1.\n");
143                 m->mothurOut("The minlength parameter .... The default is -1.\n");
144                 m->mothurOut("The maxlength parameter .... The default is -1.\n");
145                 m->mothurOut("The processors parameter allows you to specify the number of processors to use while running the command. The default is 1.\n");
146                 m->mothurOut("The name parameter allows you to provide a namesfile, and the group parameter allows you to provide a groupfile.\n");
147                 m->mothurOut("The screen.seqs command should be in the following format: \n");
148                 m->mothurOut("screen.seqs(fasta=yourFastaFile, name=youNameFile, group=yourGroupFIle, start=yourStart, end=yourEnd, maxambig=yourMaxambig,  \n");
149                 m->mothurOut("maxhomop=yourMaxhomop, minlength=youMinlength, maxlength=yourMaxlength)  \n");    
150                 m->mothurOut("Example screen.seqs(fasta=abrecovery.fasta, name=abrecovery.names, group=abrecovery.groups, start=..., end=..., maxambig=..., maxhomop=..., minlength=..., maxlength=...).\n");
151                 m->mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n\n");
152
153         }
154         catch(exception& e) {
155                 m->errorOut(e, "ScreenSeqsCommand", "help");
156                 exit(1);
157         }
158 }
159
160 //***************************************************************************************************************
161
162 ScreenSeqsCommand::~ScreenSeqsCommand(){        /*      do nothing      */      }
163
164 //***************************************************************************************************************
165
166 int ScreenSeqsCommand::execute(){
167         try{
168                 
169                 if (abort == true) { return 0; }
170                                 
171                 string goodSeqFile = outputDir + m->getRootName(m->getSimpleName(fastafile)) + "good" + m->getExtension(fastafile);
172                 string badAccnosFile =  outputDir + m->getRootName(m->getSimpleName(fastafile)) + "bad.accnos";
173                 
174                 int numFastaSeqs = 0;
175                 set<string> badSeqNames;
176                 int start = time(NULL);
177                 
178 #ifdef USE_MPI  
179                         int pid, end, numSeqsPerProcessor; 
180                         int tag = 2001;
181                         vector<unsigned long int> MPIPos;
182                         
183                         MPI_Status status; 
184                         MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
185                         MPI_Comm_size(MPI_COMM_WORLD, &processors); 
186
187                         MPI_File inMPI;
188                         MPI_File outMPIGood;
189                         MPI_File outMPIBadAccnos;
190                         
191                         int outMode=MPI_MODE_CREATE|MPI_MODE_WRONLY; 
192                         int inMode=MPI_MODE_RDONLY; 
193                         
194                         char outGoodFilename[1024];
195                         strcpy(outGoodFilename, goodSeqFile.c_str());
196
197                         char outBadAccnosFilename[1024];
198                         strcpy(outBadAccnosFilename, badAccnosFile.c_str());
199
200                         char inFileName[1024];
201                         strcpy(inFileName, fastafile.c_str());
202                         
203                         MPI_File_open(MPI_COMM_WORLD, inFileName, inMode, MPI_INFO_NULL, &inMPI);  //comm, filename, mode, info, filepointer
204                         MPI_File_open(MPI_COMM_WORLD, outGoodFilename, outMode, MPI_INFO_NULL, &outMPIGood);
205                         MPI_File_open(MPI_COMM_WORLD, outBadAccnosFilename, outMode, MPI_INFO_NULL, &outMPIBadAccnos);
206                         
207                         if (m->control_pressed) { MPI_File_close(&inMPI);  MPI_File_close(&outMPIGood); MPI_File_close(&outMPIBadAccnos); return 0; }
208                         
209                         if (pid == 0) { //you are the root process 
210                                 
211                                 MPIPos = m->setFilePosFasta(fastafile, numFastaSeqs); //fills MPIPos, returns numSeqs
212                                 
213                                 //send file positions to all processes
214                                 for(int i = 1; i < processors; i++) { 
215                                         MPI_Send(&numFastaSeqs, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
216                                         MPI_Send(&MPIPos[0], (numFastaSeqs+1), MPI_LONG, i, tag, MPI_COMM_WORLD);
217                                 }
218                                 
219                                 //figure out how many sequences you have to align
220                                 numSeqsPerProcessor = numFastaSeqs / processors;
221                                 int startIndex =  pid * numSeqsPerProcessor;
222                                 if(pid == (processors - 1)){    numSeqsPerProcessor = numFastaSeqs - pid * numSeqsPerProcessor;         }
223                                 
224                                 //align your part
225                                 driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPIGood, outMPIBadAccnos, MPIPos, badSeqNames);
226
227                                 if (m->control_pressed) { MPI_File_close(&inMPI);  MPI_File_close(&outMPIGood);  MPI_File_close(&outMPIBadAccnos);  return 0; }
228
229                                 for (int i = 1; i < processors; i++) {
230                                 
231                                         //get bad lists
232                                         int badSize;
233                                         MPI_Recv(&badSize, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
234                                         /*for (int j = 0; j < badSize; j++) {
235                                                 int length;
236                                                 MPI_Recv(&length, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);  //recv the length of the name
237                                                 char* buf2 = new char[length];                                                                          //make space to recieve it
238                                                 MPI_Recv(buf2, length, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status);  //get name
239                                                 
240                                                 string tempBuf = buf2;
241                                                 if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length); }
242                                                 delete buf2;
243                                                 
244                                                 badSeqNames.insert(tempBuf);
245                                         }*/
246                                 }
247                         }else{ //you are a child process
248                                 MPI_Recv(&numFastaSeqs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
249                                 MPIPos.resize(numFastaSeqs+1);
250                                 MPI_Recv(&MPIPos[0], (numFastaSeqs+1), MPI_LONG, 0, tag, MPI_COMM_WORLD, &status);
251
252                                 //figure out how many sequences you have to align
253                                 numSeqsPerProcessor = numFastaSeqs / processors;
254                                 int startIndex =  pid * numSeqsPerProcessor;
255                                 if(pid == (processors - 1)){    numSeqsPerProcessor = numFastaSeqs - pid * numSeqsPerProcessor;         }
256                                 
257                                 //align your part
258                                 driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPIGood, outMPIBadAccnos, MPIPos, badSeqNames);
259
260                                 if (m->control_pressed) { MPI_File_close(&inMPI);  MPI_File_close(&outMPIGood);  MPI_File_close(&outMPIBadAccnos); return 0; }
261                                 
262                                 //send bad list 
263                                 int badSize = badSeqNames.size();
264                                 MPI_Send(&badSize, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
265                                 
266                                 /*
267                                 set<string>::iterator it;
268                                 for (it = badSeqNames.begin(); it != badSeqNames.end(); it++) {
269                                         string name = *it;
270                                         int length = name.length();
271                                         char* buf2 = new char[length];
272                                         memcpy(buf2, name.c_str(), length);
273                                         
274                                         MPI_Send(&length, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
275                                         MPI_Send(buf2, length, MPI_CHAR, 0, tag, MPI_COMM_WORLD);
276                                 }*/
277                         }
278                         
279                         //close files 
280                         MPI_File_close(&inMPI);
281                         MPI_File_close(&outMPIGood);
282                         MPI_File_close(&outMPIBadAccnos);
283                         MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case
284                                         
285 #else
286                         vector<unsigned long int> positions = m->divideFile(fastafile, processors);
287                                 
288                         for (int i = 0; i < (positions.size()-1); i++) {
289                                 lines.push_back(new linePair(positions[i], positions[(i+1)]));
290                         }       
291                                                 
292         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
293                         if(processors == 1){
294                                 numFastaSeqs = driver(lines[0], goodSeqFile, badAccnosFile, fastafile, badSeqNames);
295                                 
296                                 if (m->control_pressed) { remove(goodSeqFile.c_str()); return 0; }
297                                 
298                         }else{
299                                 processIDS.resize(0);
300                                 
301                                 numFastaSeqs = createProcesses(goodSeqFile, badAccnosFile, fastafile, badSeqNames); 
302                                 
303                                 rename((goodSeqFile + toString(processIDS[0]) + ".temp").c_str(), goodSeqFile.c_str());
304                                 rename((badAccnosFile + toString(processIDS[0]) + ".temp").c_str(), badAccnosFile.c_str());
305                                 
306                                 //append alignment and report files
307                                 for(int i=1;i<processors;i++){
308                                         m->appendFiles((goodSeqFile + toString(processIDS[i]) + ".temp"), goodSeqFile);
309                                         remove((goodSeqFile + toString(processIDS[i]) + ".temp").c_str());
310                         
311                                         m->appendFiles((badAccnosFile + toString(processIDS[i]) + ".temp"), badAccnosFile);
312                                         remove((badAccnosFile + toString(processIDS[i]) + ".temp").c_str());
313                                 }
314                                 
315                                 if (m->control_pressed) { remove(goodSeqFile.c_str()); return 0; }
316                                 
317                                 //read badSeqs in because root process doesnt know what other "bad" seqs the children found
318                                 ifstream inBad;
319                                 int ableToOpen = m->openInputFile(badAccnosFile, inBad, "no error");
320                                 
321                                 if (ableToOpen == 0) {
322                                         badSeqNames.clear();
323                                         string tempName;
324                                         while (!inBad.eof()) {
325                                                 inBad >> tempName; m->gobble(inBad);
326                                                 badSeqNames.insert(tempName);
327                                         }
328                                         inBad.close();
329                                 }
330                         }
331         #else
332                         numFastaSeqs = driver(lines[0], goodSeqFile, badAccnosFile, fastafile, badSeqNames);
333                         
334                         if (m->control_pressed) { remove(goodSeqFile.c_str()); return 0; }
335                         
336         #endif
337
338 #endif          
339
340                 #ifdef USE_MPI
341                         MPI_Comm_rank(MPI_COMM_WORLD, &pid); 
342                                         
343                         if (pid == 0) { //only one process should fix files
344                         
345                                 //read accnos file with all names in it, process 0 just has its names
346                                 MPI_File inMPIAccnos;
347                                 MPI_Offset size;
348                         
349                                 char inFileName[1024];
350                                 strcpy(inFileName, badAccnosFile.c_str());
351                         
352                                 MPI_File_open(MPI_COMM_SELF, inFileName, inMode, MPI_INFO_NULL, &inMPIAccnos);  //comm, filename, mode, info, filepointer
353                                 MPI_File_get_size(inMPIAccnos, &size);
354                         
355                                 char* buffer = new char[size];
356                                 MPI_File_read(inMPIAccnos, buffer, size, MPI_CHAR, &status);
357                         
358                                 string tempBuf = buffer;
359                                 if (tempBuf.length() > size) { tempBuf = tempBuf.substr(0, size);  }
360                                 istringstream iss (tempBuf,istringstream::in);
361
362                                 delete buffer;
363                                 MPI_File_close(&inMPIAccnos);
364                                 
365                                 badSeqNames.clear();
366                                 string tempName;
367                                 while (!iss.eof()) {
368                                         iss >> tempName; m->gobble(iss);
369                                         badSeqNames.insert(tempName);
370                                 }
371                 #endif
372                                                                                                                                                                         
373                 if(namefile != "" && groupfile != "")   {       
374                         screenNameGroupFile(badSeqNames);       
375                         if (m->control_pressed) {  remove(goodSeqFile.c_str()); return 0; }
376                 }else if(namefile != "")        {       
377                         screenNameGroupFile(badSeqNames);
378                         if (m->control_pressed) {  remove(goodSeqFile.c_str());  return 0; }    
379                 }else if(groupfile != "")                               {       screenGroupFile(badSeqNames);           }       // this screens just the group
380                 
381                 if (m->control_pressed) { remove(goodSeqFile.c_str());  return 0; }
382
383                 if(alignreport != "")                                   {       screenAlignReport(badSeqNames);         }
384                 
385                 if (m->control_pressed) { remove(goodSeqFile.c_str());  return 0; }
386                 
387                 #ifdef USE_MPI
388                         }
389                 #endif
390
391                 m->mothurOutEndLine();
392                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
393                 m->mothurOut(goodSeqFile); m->mothurOutEndLine();       
394                 m->mothurOut(badAccnosFile); m->mothurOutEndLine();     
395                 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
396                 m->mothurOutEndLine();
397                 m->mothurOutEndLine();
398
399                 m->mothurOut("It took " + toString(time(NULL) - start) + " secs to screen " + toString(numFastaSeqs) + " sequences.");
400                 m->mothurOutEndLine();
401
402                 return 0;
403         }
404         catch(exception& e) {
405                 m->errorOut(e, "ScreenSeqsCommand", "execute");
406                 exit(1);
407         }
408 }
409
410 //***************************************************************************************************************
411
412 int ScreenSeqsCommand::screenNameGroupFile(set<string> badSeqNames){
413         try {
414                 ifstream inputNames;
415                 m->openInputFile(namefile, inputNames);
416                 set<string> badSeqGroups;
417                 string seqName, seqList, group;
418                 set<string>::iterator it;
419
420                 string goodNameFile = outputDir + m->getRootName(m->getSimpleName(namefile)) + "good" + m->getExtension(namefile);
421                 outputNames.push_back(goodNameFile); 
422                 
423                 ofstream goodNameOut;   m->openOutputFile(goodNameFile, goodNameOut);
424         
425                 while(!inputNames.eof()){
426                         if (m->control_pressed) { goodNameOut.close();  inputNames.close(); remove(goodNameFile.c_str());  return 0; }
427
428                         inputNames >> seqName >> seqList;
429                         it = badSeqNames.find(seqName);
430                         
431                         if(it != badSeqNames.end()){
432                                 badSeqNames.erase(it);
433                                 
434                                 if(namefile != ""){
435                                         int start = 0;
436                                         for(int i=0;i<seqList.length();i++){
437                                                 if(seqList[i] == ','){
438                                                         badSeqGroups.insert(seqList.substr(start,i-start));
439                                                         start = i+1;
440                                                 }                                       
441                                         }
442                                         badSeqGroups.insert(seqList.substr(start,seqList.length()-start));
443                                 }
444                         }
445                         else{
446                                 goodNameOut << seqName << '\t' << seqList << endl;
447                         }
448                         m->gobble(inputNames);
449                 }
450                 inputNames.close();
451                 goodNameOut.close();
452         
453                 //we were unable to remove some of the bad sequences
454                 if (badSeqNames.size() != 0) {
455                         for (it = badSeqNames.begin(); it != badSeqNames.end(); it++) {  
456                                 m->mothurOut("Your namefile does not include the sequence " + *it + " please correct."); 
457                                 m->mothurOutEndLine();
458                         }
459                 }
460
461                 if(groupfile != ""){
462                         
463                         ifstream inputGroups;
464                         m->openInputFile(groupfile, inputGroups);
465
466                         string goodGroupFile = outputDir + m->getRootName(m->getSimpleName(groupfile)) + "good" + m->getExtension(groupfile);
467                         outputNames.push_back(goodGroupFile);  
468                         
469                         ofstream goodGroupOut;  m->openOutputFile(goodGroupFile, goodGroupOut);
470                         
471                         while(!inputGroups.eof()){
472                                 if (m->control_pressed) { goodGroupOut.close(); inputGroups.close(); remove(goodNameFile.c_str());  remove(goodGroupFile.c_str()); return 0; }
473
474                                 inputGroups >> seqName >> group;
475
476                                 it = badSeqGroups.find(seqName);
477                                 
478                                 if(it != badSeqGroups.end()){
479                                         badSeqGroups.erase(it);
480                                 }
481                                 else{
482                                         goodGroupOut << seqName << '\t' << group << endl;
483                                 }
484                                 m->gobble(inputGroups);
485                         }
486                         inputGroups.close();
487                         goodGroupOut.close();
488                         
489                         //we were unable to remove some of the bad sequences
490                         if (badSeqGroups.size() != 0) {
491                                 for (it = badSeqGroups.begin(); it != badSeqGroups.end(); it++) {  
492                                         m->mothurOut("Your groupfile does not include the sequence " + *it + " please correct."); 
493                                         m->mothurOutEndLine();
494                                 }
495                         }
496                 }
497                         
498                 return 0;
499         
500         }
501         catch(exception& e) {
502                 m->errorOut(e, "ScreenSeqsCommand", "screenNameGroupFile");
503                 exit(1);
504         }
505 }
506
507 //***************************************************************************************************************
508
509 int ScreenSeqsCommand::screenGroupFile(set<string> badSeqNames){
510         try {
511                 ifstream inputGroups;
512                 m->openInputFile(groupfile, inputGroups);
513                 string seqName, group;
514                 set<string>::iterator it;
515                 
516                 string goodGroupFile = outputDir + m->getRootName(m->getSimpleName(groupfile)) + "good" + m->getExtension(groupfile);
517                 outputNames.push_back(goodGroupFile); 
518                 ofstream goodGroupOut;  m->openOutputFile(goodGroupFile, goodGroupOut);
519                 
520                 while(!inputGroups.eof()){
521                         if (m->control_pressed) { goodGroupOut.close(); inputGroups.close(); remove(goodGroupFile.c_str()); return 0; }
522
523                         inputGroups >> seqName >> group;
524                         it = badSeqNames.find(seqName);
525                         
526                         if(it != badSeqNames.end()){
527                                 badSeqNames.erase(it);
528                         }
529                         else{
530                                 goodGroupOut << seqName << '\t' << group << endl;
531                         }
532                         m->gobble(inputGroups);
533                 }
534                 
535                 if (m->control_pressed) { goodGroupOut.close();  inputGroups.close(); remove(goodGroupFile.c_str());  return 0; }
536
537                 //we were unable to remove some of the bad sequences
538                 if (badSeqNames.size() != 0) {
539                         for (it = badSeqNames.begin(); it != badSeqNames.end(); it++) {  
540                                 m->mothurOut("Your groupfile does not include the sequence " + *it + " please correct."); 
541                                 m->mothurOutEndLine();
542                         }
543                 }
544                 
545                 inputGroups.close();
546                 goodGroupOut.close();
547                 
548                 if (m->control_pressed) { remove(goodGroupFile.c_str());   }
549                 
550                 return 0;
551         
552         }
553         catch(exception& e) {
554                 m->errorOut(e, "ScreenSeqsCommand", "screenGroupFile");
555                 exit(1);
556         }
557 }
558
559 //***************************************************************************************************************
560
561 int ScreenSeqsCommand::screenAlignReport(set<string> badSeqNames){
562         try {
563                 ifstream inputAlignReport;
564                 m->openInputFile(alignreport, inputAlignReport);
565                 string seqName, group;
566                 set<string>::iterator it;
567                 
568                 string goodAlignReportFile = outputDir + m->getRootName(m->getSimpleName(alignreport)) + "good" + m->getExtension(alignreport);
569                 outputNames.push_back(goodAlignReportFile);  
570                 ofstream goodAlignReportOut;    m->openOutputFile(goodAlignReportFile, goodAlignReportOut);
571
572                 while (!inputAlignReport.eof()) {               //      need to copy header
573                         char c = inputAlignReport.get();
574                         goodAlignReportOut << c;
575                         if (c == 10 || c == 13){        break;  }       
576                 }
577
578                 while(!inputAlignReport.eof()){
579                         if (m->control_pressed) { goodAlignReportOut.close(); inputAlignReport.close(); remove(goodAlignReportFile.c_str()); return 0; }
580
581                         inputAlignReport >> seqName;
582                         it = badSeqNames.find(seqName);
583                         string line;            
584                         while (!inputAlignReport.eof()) {               //      need to copy header
585                                 char c = inputAlignReport.get();
586                                 line += c;
587                                 if (c == 10 || c == 13){        break;  }       
588                         }
589                         
590                         if(it != badSeqNames.end()){
591                                 badSeqNames.erase(it);
592                         }
593                         else{
594                                 goodAlignReportOut << seqName << '\t' << line;
595                         }
596                         m->gobble(inputAlignReport);
597                 }
598                 
599                 if (m->control_pressed) { goodAlignReportOut.close();  inputAlignReport.close(); remove(goodAlignReportFile.c_str());  return 0; }
600
601                 //we were unable to remove some of the bad sequences
602                 if (badSeqNames.size() != 0) {
603                         for (it = badSeqNames.begin(); it != badSeqNames.end(); it++) {  
604                                 m->mothurOut("Your alignreport file does not include the sequence " + *it + " please correct."); 
605                                 m->mothurOutEndLine();
606                         }
607                 }
608
609                 inputAlignReport.close();
610                 goodAlignReportOut.close();
611                                 
612                 if (m->control_pressed) {  remove(goodAlignReportFile.c_str());  return 0; }
613                 
614                 return 0;
615         
616         }
617         catch(exception& e) {
618                 m->errorOut(e, "ScreenSeqsCommand", "screenAlignReport");
619                 exit(1);
620         }
621         
622 }
623 //**********************************************************************************************************************
624
625 int ScreenSeqsCommand::driver(linePair* filePos, string goodFName, string badAccnosFName, string filename, set<string>& badSeqNames){
626         try {
627                 ofstream goodFile;
628                 m->openOutputFile(goodFName, goodFile);
629                 
630                 ofstream badAccnosFile;
631                 m->openOutputFile(badAccnosFName, badAccnosFile);
632                 
633                 ifstream inFASTA;
634                 m->openInputFile(filename, inFASTA);
635
636                 inFASTA.seekg(filePos->start);
637
638                 bool done = false;
639                 int count = 0;
640         
641                 while (!done) {
642                 
643                         if (m->control_pressed) {  return 0; }
644                         
645                         Sequence currSeq(inFASTA); m->gobble(inFASTA);
646                         if (currSeq.getName() != "") {
647                                 bool goodSeq = 1;               //      innocent until proven guilty
648                                 if(goodSeq == 1 && startPos != -1 && startPos < currSeq.getStartPos())                  {       goodSeq = 0;    }
649                                 if(goodSeq == 1 && endPos != -1 && endPos > currSeq.getEndPos())                                {       goodSeq = 0;    }
650                                 if(goodSeq == 1 && maxAmbig != -1 && maxAmbig < currSeq.getAmbigBases())                {       goodSeq = 0;    }
651                                 if(goodSeq == 1 && maxHomoP != -1 && maxHomoP < currSeq.getLongHomoPolymer())   {       goodSeq = 0;    }
652                                 if(goodSeq == 1 && minLength != -1 && minLength > currSeq.getNumBases())                {       goodSeq = 0;    }
653                                 if(goodSeq == 1 && maxLength != -1 && maxLength < currSeq.getNumBases())                {       goodSeq = 0;    }
654                                 
655                                 if(goodSeq == 1){
656                                         currSeq.printSequence(goodFile);        
657                                 }
658                                 else{
659                                         badAccnosFile << currSeq.getName() << endl;
660                                         badSeqNames.insert(currSeq.getName());
661                                 }
662                         count++;
663                         }
664                         
665                         unsigned long int pos = inFASTA.tellg();
666                         if ((pos == -1) || (pos >= filePos->end)) { break; }
667                         
668                         //report progress
669                         if((count) % 100 == 0){ m->mothurOut("Processing sequence: " + toString(count)); m->mothurOutEndLine();         }
670                 }
671                 //report progress
672                 if((count) % 100 != 0){ m->mothurOut("Processing sequence: " + toString(count)); m->mothurOutEndLine();         }
673                 
674                         
675                 goodFile.close();
676                 inFASTA.close();
677                 badAccnosFile.close();
678                 
679                 return count;
680         }
681         catch(exception& e) {
682                 m->errorOut(e, "ScreenSeqsCommand", "driver");
683                 exit(1);
684         }
685 }
686 //**********************************************************************************************************************
687 #ifdef USE_MPI
688 int ScreenSeqsCommand::driverMPI(int start, int num, MPI_File& inMPI, MPI_File& goodFile, MPI_File& badAccnosFile, vector<unsigned long int>& MPIPos, set<string>& badSeqNames){
689         try {
690                 string outputString = "";
691                 MPI_Status statusGood; 
692                 MPI_Status statusBadAccnos; 
693                 MPI_Status status; 
694                 int pid;
695                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
696
697                 for(int i=0;i<num;i++){
698                 
699                         if (m->control_pressed) {  return 0; }
700                         
701                         //read next sequence
702                         int length = MPIPos[start+i+1] - MPIPos[start+i];
703
704                         char* buf4 = new char[length];
705                         memcpy(buf4, outputString.c_str(), length);
706
707                         MPI_File_read_at(inMPI, MPIPos[start+i], buf4, length, MPI_CHAR, &status);
708                         
709                         string tempBuf = buf4;  delete buf4;
710                         if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length);  }
711                         istringstream iss (tempBuf,istringstream::in);
712                         
713                         Sequence currSeq(iss);                  
714                         
715                         //process seq
716                         if (currSeq.getName() != "") {
717                                 bool goodSeq = 1;               //      innocent until proven guilty
718                                 if(goodSeq == 1 && startPos != -1 && startPos < currSeq.getStartPos())                  {       goodSeq = 0;    }
719                                 if(goodSeq == 1 && endPos != -1 && endPos > currSeq.getEndPos())                                {       goodSeq = 0;    }
720                                 if(goodSeq == 1 && maxAmbig != -1 && maxAmbig < currSeq.getAmbigBases())                {       goodSeq = 0;    }
721                                 if(goodSeq == 1 && maxHomoP != -1 && maxHomoP < currSeq.getLongHomoPolymer())   {       goodSeq = 0;    }
722                                 if(goodSeq == 1 && minLength != -1 && minLength > currSeq.getNumBases())                {       goodSeq = 0;    }
723                                 if(goodSeq == 1 && maxLength != -1 && maxLength < currSeq.getNumBases())                {       goodSeq = 0;    }
724                                 
725                                 if(goodSeq == 1){
726                                         outputString =  ">" + currSeq.getName() + "\n" + currSeq.getAligned() + "\n";
727                                 
728                                         //print good seq
729                                         length = outputString.length();
730                                         char* buf2 = new char[length];
731                                         memcpy(buf2, outputString.c_str(), length);
732                                         
733                                         MPI_File_write_shared(goodFile, buf2, length, MPI_CHAR, &statusGood);
734                                         delete buf2;
735                                 }
736                                 else{
737
738                                         badSeqNames.insert(currSeq.getName());
739                                         
740                                         //write to bad accnos file
741                                         outputString = currSeq.getName() + "\n";
742                                 
743                                         length = outputString.length();
744                                         char* buf3 = new char[length];
745                                         memcpy(buf3, outputString.c_str(), length);
746                                         
747                                         MPI_File_write_shared(badAccnosFile, buf3, length, MPI_CHAR, &statusBadAccnos);
748                                         delete buf3;
749                                 }
750                         }
751                 }
752                                 
753                 return 1;
754         }
755         catch(exception& e) {
756                 m->errorOut(e, "ScreenSeqsCommand", "driverMPI");
757                 exit(1);
758         }
759 }
760 #endif
761 /**************************************************************************************************/
762
763 int ScreenSeqsCommand::createProcesses(string goodFileName, string badAccnos, string filename, set<string>& badSeqNames) {
764         try {
765 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
766                 int process = 0;
767                 int num = 0;
768                 
769                 //loop through and create all the processes you want
770                 while (process != processors) {
771                         int pid = fork();
772                         
773                         if (pid > 0) {
774                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
775                                 process++;
776                         }else if (pid == 0){
777                                 num = driver(lines[process], goodFileName + toString(getpid()) + ".temp", badAccnos + toString(getpid()) + ".temp", filename, badSeqNames);
778                                 
779                                 //pass numSeqs to parent
780                                 ofstream out;
781                                 string tempFile = filename + toString(getpid()) + ".num.temp";
782                                 m->openOutputFile(tempFile, out);
783                                 out << num << endl;
784                                 out.close();
785                                 
786                                 exit(0);
787                         }else { m->mothurOut("unable to spawn the necessary processes."); m->mothurOutEndLine(); exit(0); }
788                 }
789                 
790                 //force parent to wait until all the processes are done
791                 for (int i=0;i<processors;i++) { 
792                         int temp = processIDS[i];
793                         wait(&temp);
794                 }
795                 
796                 for (int i = 0; i < processIDS.size(); i++) {
797                         ifstream in;
798                         string tempFile =  filename + toString(processIDS[i]) + ".num.temp";
799                         m->openInputFile(tempFile, in);
800                         if (!in.eof()) { int tempNum = 0; in >> tempNum; num += tempNum; }
801                         in.close(); remove(tempFile.c_str());
802                 }
803                 
804                 return num;
805 #endif          
806         }
807         catch(exception& e) {
808                 m->errorOut(e, "ScreenSeqsCommand", "createProcesses");
809                 exit(1);
810         }
811 }
812
813 //***************************************************************************************************************
814
815