]> git.donarmstrong.com Git - mothur.git/blob - screenseqscommand.cpp
started work on sffinfo command. fixed bug across all paralellized commands if the...
[mothur.git] / screenseqscommand.cpp
1 /*
2  *  screenseqscommand.cpp
3  *  Mothur
4  *
5  *  Created by Pat Schloss on 6/3/09.
6  *  Copyright 2009 Patrick D. Schloss. All rights reserved.
7  *
8  */
9
10 #include "screenseqscommand.h"
11 #include "sequence.hpp"
12
13 //***************************************************************************************************************
14
15 ScreenSeqsCommand::ScreenSeqsCommand(string option)  {
16         try {
17                 abort = false;
18                 
19                 //allow user to run help
20                 if(option == "help") { help(); abort = true; }
21                 
22                 else {
23                         //valid paramters for this command
24                         string AlignArray[] =  {"fasta", "start", "end", "maxambig", "maxhomop", "minlength", "maxlength",
25                                                                         "name", "group", "alignreport","processors","outputdir","inputdir"};
26                         vector<string> myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string)));
27                         
28                         OptionParser parser(option);
29                         map<string,string> parameters = parser.getParameters();
30                         
31                         ValidParameters validParameter;
32                         map<string,string>::iterator it;
33                         
34                         //check to make sure all parameters are valid for command
35                         for (it = parameters.begin(); it != parameters.end(); it++) { 
36                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
37                         }
38                         
39                         //if the user changes the input directory command factory will send this info to us in the output parameter 
40                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
41                         if (inputDir == "not found"){   inputDir = "";          }
42                         else {
43                                 string path;
44                                 it = parameters.find("fasta");
45                                 //user has given a template file
46                                 if(it != parameters.end()){ 
47                                         path = hasPath(it->second);
48                                         //if the user has not given a path then, add inputdir. else leave path alone.
49                                         if (path == "") {       parameters["fasta"] = inputDir + it->second;            }
50                                 }
51                                 
52                                 it = parameters.find("group");
53                                 //user has given a template file
54                                 if(it != parameters.end()){ 
55                                         path = hasPath(it->second);
56                                         //if the user has not given a path then, add inputdir. else leave path alone.
57                                         if (path == "") {       parameters["group"] = inputDir + it->second;            }
58                                 }
59                                 
60                                 it = parameters.find("name");
61                                 //user has given a template file
62                                 if(it != parameters.end()){ 
63                                         path = hasPath(it->second);
64                                         //if the user has not given a path then, add inputdir. else leave path alone.
65                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
66                                 }
67                                 
68                                 it = parameters.find("alignreport");
69                                 //user has given a template file
70                                 if(it != parameters.end()){ 
71                                         path = hasPath(it->second);
72                                         //if the user has not given a path then, add inputdir. else leave path alone.
73                                         if (path == "") {       parameters["alignreport"] = inputDir + it->second;              }
74                                 }
75                         }
76
77                         //check for required parameters
78                         fastafile = validParameter.validFile(parameters, "fasta", true);
79                         if (fastafile == "not found") { m->mothurOut("fasta is a required parameter for the screen.seqs command."); m->mothurOutEndLine(); abort = true; }
80                         else if (fastafile == "not open") { abort = true; }     
81         
82                         groupfile = validParameter.validFile(parameters, "group", true);
83                         if (groupfile == "not open") { abort = true; }  
84                         else if (groupfile == "not found") { groupfile = ""; }
85                         
86                         namefile = validParameter.validFile(parameters, "name", true);
87                         if (namefile == "not open") { abort = true; }
88                         else if (namefile == "not found") { namefile = ""; }    
89
90                         alignreport = validParameter.validFile(parameters, "alignreport", true);
91                         if (alignreport == "not open") { abort = true; }
92                         else if (alignreport == "not found") { alignreport = ""; }      
93                         
94                         //if the user changes the output directory command factory will send this info to us in the output parameter 
95                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  
96                                 outputDir = ""; 
97                                 outputDir += hasPath(fastafile); //if user entered a file with a path then preserve it  
98                         }
99
100                         //check for optional parameter and set defaults
101                         // ...at some point should added some additional type checking...
102                         string temp;
103                         temp = validParameter.validFile(parameters, "start", false);            if (temp == "not found") { temp = "-1"; }
104                         convert(temp, startPos); 
105                 
106                         temp = validParameter.validFile(parameters, "end", false);                      if (temp == "not found") { temp = "-1"; }
107                         convert(temp, endPos);  
108
109                         temp = validParameter.validFile(parameters, "maxambig", false);         if (temp == "not found") { temp = "-1"; }
110                         convert(temp, maxAmbig);  
111
112                         temp = validParameter.validFile(parameters, "maxhomop", false);         if (temp == "not found") { temp = "-1"; }
113                         convert(temp, maxHomoP);  
114
115                         temp = validParameter.validFile(parameters, "minlength", false);        if (temp == "not found") { temp = "-1"; }
116                         convert(temp, minLength); 
117                         
118                         temp = validParameter.validFile(parameters, "maxlength", false);        if (temp == "not found") { temp = "-1"; }
119                         convert(temp, maxLength); 
120                         
121                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found"){       temp = "1";                             }
122                         convert(temp, processors); 
123
124                 }
125
126         }
127         catch(exception& e) {
128                 m->errorOut(e, "ScreenSeqsCommand", "ScreenSeqsCommand");
129                 exit(1);
130         }
131 }
132 //**********************************************************************************************************************
133
134 void ScreenSeqsCommand::help(){
135         try {
136                 m->mothurOut("The screen.seqs command reads a fastafile and creates .....\n");
137                 m->mothurOut("The screen.seqs command parameters are fasta, start, end, maxambig, maxhomop, minlength, maxlength, name, group and processors.\n");
138                 m->mothurOut("The fasta parameter is required.\n");
139                 m->mothurOut("The start parameter .... The default is -1.\n");
140                 m->mothurOut("The end parameter .... The default is -1.\n");
141                 m->mothurOut("The maxambig parameter .... The default is -1.\n");
142                 m->mothurOut("The maxhomop parameter .... The default is -1.\n");
143                 m->mothurOut("The minlength parameter .... The default is -1.\n");
144                 m->mothurOut("The maxlength parameter .... The default is -1.\n");
145                 m->mothurOut("The processors parameter allows you to specify the number of processors to use while running the command. The default is 1.\n");
146                 m->mothurOut("The name parameter allows you to provide a namesfile, and the group parameter allows you to provide a groupfile.\n");
147                 m->mothurOut("The screen.seqs command should be in the following format: \n");
148                 m->mothurOut("screen.seqs(fasta=yourFastaFile, name=youNameFile, group=yourGroupFIle, start=yourStart, end=yourEnd, maxambig=yourMaxambig,  \n");
149                 m->mothurOut("maxhomop=yourMaxhomop, minlength=youMinlength, maxlength=yourMaxlength)  \n");    
150                 m->mothurOut("Example screen.seqs(fasta=abrecovery.fasta, name=abrecovery.names, group=abrecovery.groups, start=..., end=..., maxambig=..., maxhomop=..., minlength=..., maxlength=...).\n");
151                 m->mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n\n");
152
153         }
154         catch(exception& e) {
155                 m->errorOut(e, "ScreenSeqsCommand", "help");
156                 exit(1);
157         }
158 }
159
160 //***************************************************************************************************************
161
162 ScreenSeqsCommand::~ScreenSeqsCommand(){        /*      do nothing      */      }
163
164 //***************************************************************************************************************
165
166 int ScreenSeqsCommand::execute(){
167         try{
168                 
169                 if (abort == true) { return 0; }
170                                 
171                 string goodSeqFile = outputDir + getRootName(getSimpleName(fastafile)) + "good" + getExtension(fastafile);
172                 string badSeqFile =  outputDir + getRootName(getSimpleName(fastafile)) + "bad" + getExtension(fastafile);
173                 string badAccnosFile =  outputDir + getRootName(getSimpleName(fastafile)) + "bad.accnos";
174                 
175                 int numFastaSeqs = 0;
176                 set<string> badSeqNames;
177                 int start = time(NULL);
178                 
179 #ifdef USE_MPI  
180                         int pid, end, numSeqsPerProcessor; 
181                         int tag = 2001;
182                         vector<long> MPIPos;
183                         
184                         MPI_Status status; 
185                         MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
186                         MPI_Comm_size(MPI_COMM_WORLD, &processors); 
187
188                         MPI_File inMPI;
189                         MPI_File outMPIGood;
190                         MPI_File outMPIBad;
191                         MPI_File outMPIBadAccnos;
192                         
193                         int outMode=MPI_MODE_CREATE|MPI_MODE_WRONLY; 
194                         int inMode=MPI_MODE_RDONLY; 
195                         
196                         char outGoodFilename[1024];
197                         strcpy(outGoodFilename, goodSeqFile.c_str());
198
199                         char outBadFilename[1024];
200                         strcpy(outBadFilename, badSeqFile.c_str());
201                         
202                         char outBadAccnosFilename[1024];
203                         strcpy(outBadAccnosFilename, badAccnosFile.c_str());
204
205                         char inFileName[1024];
206                         strcpy(inFileName, fastafile.c_str());
207                         
208                         MPI_File_open(MPI_COMM_WORLD, inFileName, inMode, MPI_INFO_NULL, &inMPI);  //comm, filename, mode, info, filepointer
209                         MPI_File_open(MPI_COMM_WORLD, outGoodFilename, outMode, MPI_INFO_NULL, &outMPIGood);
210                         MPI_File_open(MPI_COMM_WORLD, outBadFilename, outMode, MPI_INFO_NULL, &outMPIBad);
211                         MPI_File_open(MPI_COMM_WORLD, outBadAccnosFilename, outMode, MPI_INFO_NULL, &outMPIBadAccnos);
212                         
213                         if (m->control_pressed) { MPI_File_close(&inMPI);  MPI_File_close(&outMPIGood);  MPI_File_close(&outMPIBad); MPI_File_close(&outMPIBadAccnos); return 0; }
214                         
215                         if (pid == 0) { //you are the root process 
216                                 
217                                 MPIPos = setFilePosFasta(fastafile, numFastaSeqs); //fills MPIPos, returns numSeqs
218                                 
219                                 //send file positions to all processes
220                                 for(int i = 1; i < processors; i++) { 
221                                         MPI_Send(&numFastaSeqs, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
222                                         MPI_Send(&MPIPos[0], (numFastaSeqs+1), MPI_LONG, i, tag, MPI_COMM_WORLD);
223                                 }
224                                 
225                                 //figure out how many sequences you have to align
226                                 numSeqsPerProcessor = numFastaSeqs / processors;
227                                 int startIndex =  pid * numSeqsPerProcessor;
228                                 if(pid == (processors - 1)){    numSeqsPerProcessor = numFastaSeqs - pid * numSeqsPerProcessor;         }
229                                 
230                                 //align your part
231                                 driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPIGood, outMPIBad, outMPIBadAccnos, MPIPos, badSeqNames);
232
233                                 if (m->control_pressed) { MPI_File_close(&inMPI);  MPI_File_close(&outMPIGood);  MPI_File_close(&outMPIBadAccnos); MPI_File_close(&outMPIBad);  return 0; }
234
235                                 for (int i = 1; i < processors; i++) {
236                                 
237                                         //get bad lists
238                                         int badSize;
239                                         MPI_Recv(&badSize, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
240                                         /*for (int j = 0; j < badSize; j++) {
241                                                 int length;
242                                                 MPI_Recv(&length, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);  //recv the length of the name
243                                                 char* buf2 = new char[length];                                                                          //make space to recieve it
244                                                 MPI_Recv(buf2, length, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status);  //get name
245                                                 
246                                                 string tempBuf = buf2;
247                                                 if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length); }
248                                                 delete buf2;
249                                                 
250                                                 badSeqNames.insert(tempBuf);
251                                         }*/
252                                 }
253                         }else{ //you are a child process
254                                 MPI_Recv(&numFastaSeqs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
255                                 MPIPos.resize(numFastaSeqs+1);
256                                 MPI_Recv(&MPIPos[0], (numFastaSeqs+1), MPI_LONG, 0, tag, MPI_COMM_WORLD, &status);
257
258                                 //figure out how many sequences you have to align
259                                 numSeqsPerProcessor = numFastaSeqs / processors;
260                                 int startIndex =  pid * numSeqsPerProcessor;
261                                 if(pid == (processors - 1)){    numSeqsPerProcessor = numFastaSeqs - pid * numSeqsPerProcessor;         }
262                                 
263                                 //align your part
264                                 driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPIGood, outMPIBad, outMPIBadAccnos, MPIPos, badSeqNames);
265
266                                 if (m->control_pressed) { MPI_File_close(&inMPI);  MPI_File_close(&outMPIGood);  MPI_File_close(&outMPIBad); MPI_File_close(&outMPIBadAccnos); return 0; }
267                                 
268                                 //send bad list 
269                                 int badSize = badSeqNames.size();
270                                 MPI_Send(&badSize, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
271                                 
272                                 /*
273                                 set<string>::iterator it;
274                                 for (it = badSeqNames.begin(); it != badSeqNames.end(); it++) {
275                                         string name = *it;
276                                         int length = name.length();
277                                         char* buf2 = new char[length];
278                                         memcpy(buf2, name.c_str(), length);
279                                         
280                                         MPI_Send(&length, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
281                                         MPI_Send(buf2, length, MPI_CHAR, 0, tag, MPI_COMM_WORLD);
282                                 }*/
283                         }
284                         
285                         //close files 
286                         MPI_File_close(&inMPI);
287                         MPI_File_close(&outMPIGood);
288                         MPI_File_close(&outMPIBad);
289                         MPI_File_close(&outMPIBadAccnos);
290                         MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case
291                                         
292 #else
293                                         
294         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
295                         if(processors == 1){
296                                 ifstream inFASTA;
297                                 openInputFile(fastafile, inFASTA);
298                                 getNumSeqs(inFASTA, numFastaSeqs);
299                                 inFASTA.close();
300                                 
301                                 lines.push_back(new linePair(0, numFastaSeqs));
302                                 
303                                 driver(lines[0], goodSeqFile, badSeqFile, badAccnosFile, fastafile, badSeqNames);
304                                 
305                                 if (m->control_pressed) { remove(goodSeqFile.c_str()); remove(badSeqFile.c_str()); return 0; }
306                                 
307                         }else{
308                                 vector<unsigned long int> positions;
309                                 processIDS.resize(0);
310                                 
311                                 ifstream inFASTA;
312                                 openInputFile(fastafile, inFASTA);
313                                 
314                                 string input;
315                                 while(!inFASTA.eof()){
316                                         input = getline(inFASTA);
317                                         if (input.length() != 0) {
318                                                 if(input[0] == '>'){    unsigned long int pos = inFASTA.tellg(); positions.push_back(pos - input.length() - 1); }
319                                         }
320                                 }
321                                 inFASTA.close();
322                                 
323                                 numFastaSeqs = positions.size();
324                         
325                                 int numSeqsPerProcessor = numFastaSeqs / processors;
326                                         
327                                 for (int i = 0; i < processors; i++) {
328                                         unsigned long int startPos = positions[ i * numSeqsPerProcessor ];
329                                         if(i == processors - 1){
330                                                 numSeqsPerProcessor = numFastaSeqs - i * numSeqsPerProcessor;
331                                         }
332                                         lines.push_back(new linePair(startPos, numSeqsPerProcessor));
333                 
334                                 }
335                                 
336                                 createProcesses(goodSeqFile, badSeqFile, badAccnosFile, fastafile, badSeqNames); 
337                                 
338                                 rename((goodSeqFile + toString(processIDS[0]) + ".temp").c_str(), goodSeqFile.c_str());
339                                 rename((badSeqFile + toString(processIDS[0]) + ".temp").c_str(), badSeqFile.c_str());
340                                 rename((badAccnosFile + toString(processIDS[0]) + ".temp").c_str(), badAccnosFile.c_str());
341                                 
342                                 //append alignment and report files
343                                 for(int i=1;i<processors;i++){
344                                         appendFiles((goodSeqFile + toString(processIDS[i]) + ".temp"), goodSeqFile);
345                                         remove((goodSeqFile + toString(processIDS[i]) + ".temp").c_str());
346                                         
347                                         appendFiles((badSeqFile + toString(processIDS[i]) + ".temp"), badSeqFile);
348                                         remove((badSeqFile + toString(processIDS[i]) + ".temp").c_str());
349                                         
350                                         appendFiles((badAccnosFile + toString(processIDS[i]) + ".temp"), badAccnosFile);
351                                         remove((badAccnosFile + toString(processIDS[i]) + ".temp").c_str());
352                                 }
353                                 
354                                 if (m->control_pressed) { remove(goodSeqFile.c_str()); remove(badSeqFile.c_str()); return 0; }
355                                 
356                                 //read badSeqs in because root process doesnt know what other "bad" seqs the children found
357                                 ifstream inBad;
358                                 int ableToOpen = openInputFile(badAccnosFile, inBad, "no error");
359                                 
360                                 if (ableToOpen == 0) {
361                                         badSeqNames.clear();
362                                         string tempName;
363                                         while (!inBad.eof()) {
364                                                 inBad >> tempName; gobble(inBad);
365                                                 badSeqNames.insert(tempName);
366                                         }
367                                         inBad.close();
368                                 }
369                         }
370         #else
371                         ifstream inFASTA;
372                         openInputFile(fastafile, inFASTA);
373                         getNumSeqs(inFASTA, numFastaSeqs);
374                         inFASTA.close();
375                         
376                         lines.push_back(new linePair(0, numFastaSeqs));
377                         
378                         driver(lines[0], goodSeqFile, badSeqFile, badAccnosFile, fastafile, badSeqNames);
379                         
380                         if (m->control_pressed) { remove(goodSeqFile.c_str()); remove(badSeqFile.c_str()); return 0; }
381                         
382         #endif
383
384 #endif          
385
386                 #ifdef USE_MPI
387                         MPI_Comm_rank(MPI_COMM_WORLD, &pid); 
388                                         
389                         if (pid == 0) { //only one process should fix files
390                         
391                                 //read accnos file with all names in it, process 0 just has its names
392                                 MPI_File inMPIAccnos;
393                                 MPI_Offset size;
394                         
395                                 char inFileName[1024];
396                                 strcpy(inFileName, badAccnosFile.c_str());
397                         
398                                 MPI_File_open(MPI_COMM_SELF, inFileName, inMode, MPI_INFO_NULL, &inMPIAccnos);  //comm, filename, mode, info, filepointer
399                                 MPI_File_get_size(inMPIAccnos, &size);
400                         
401                                 char* buffer = new char[size];
402                                 MPI_File_read(inMPIAccnos, buffer, size, MPI_CHAR, &status);
403                         
404                                 string tempBuf = buffer;
405                                 if (tempBuf.length() > size) { tempBuf = tempBuf.substr(0, size);  }
406                                 istringstream iss (tempBuf,istringstream::in);
407
408                                 delete buffer;
409                                 MPI_File_close(&inMPIAccnos);
410                                 
411                                 badSeqNames.clear();
412                                 string tempName;
413                                 while (!iss.eof()) {
414                                         iss >> tempName; gobble(iss);
415                                         badSeqNames.insert(tempName);
416                                 }
417                 #endif
418                                                                                                                                                                         
419                 if(namefile != "" && groupfile != "")   {       
420                         screenNameGroupFile(badSeqNames);       
421                         if (m->control_pressed) {  remove(goodSeqFile.c_str()); remove(badSeqFile.c_str()); return 0; }
422                 }else if(namefile != "")        {       
423                         screenNameGroupFile(badSeqNames);
424                         if (m->control_pressed) {  remove(goodSeqFile.c_str()); remove(badSeqFile.c_str()); return 0; } 
425                 }else if(groupfile != "")                               {       screenGroupFile(badSeqNames);           }       // this screens just the group
426                 
427                 if (m->control_pressed) { remove(goodSeqFile.c_str()); remove(badSeqFile.c_str()); return 0; }
428
429                 if(alignreport != "")                                   {       screenAlignReport(badSeqNames);         }
430                 
431                 if (m->control_pressed) { remove(goodSeqFile.c_str()); remove(badSeqFile.c_str()); return 0; }
432                 
433                 #ifdef USE_MPI
434                         }
435                 #endif
436
437                 m->mothurOutEndLine();
438                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
439                 m->mothurOut(goodSeqFile); m->mothurOutEndLine();       
440                 m->mothurOut(badSeqFile); m->mothurOutEndLine();        
441                 m->mothurOut(badAccnosFile); m->mothurOutEndLine();     
442                 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
443                 m->mothurOutEndLine();
444                 m->mothurOutEndLine();
445
446                 m->mothurOut("It took " + toString(time(NULL) - start) + " secs to screen " + toString(numFastaSeqs) + " sequences.");
447                 m->mothurOutEndLine();
448
449                 return 0;
450         }
451         catch(exception& e) {
452                 m->errorOut(e, "ScreenSeqsCommand", "execute");
453                 exit(1);
454         }
455 }
456
457 //***************************************************************************************************************
458
459 int ScreenSeqsCommand::screenNameGroupFile(set<string> badSeqNames){
460         try {
461                 ifstream inputNames;
462                 openInputFile(namefile, inputNames);
463                 set<string> badSeqGroups;
464                 string seqName, seqList, group;
465                 set<string>::iterator it;
466
467                 string goodNameFile = outputDir + getRootName(getSimpleName(namefile)) + "good" + getExtension(namefile);
468                 string badNameFile = outputDir + getRootName(getSimpleName(namefile)) + "bad" + getExtension(namefile);
469                 
470                 outputNames.push_back(goodNameFile);  outputNames.push_back(badNameFile);
471                 
472                 ofstream goodNameOut;   openOutputFile(goodNameFile, goodNameOut);
473                 ofstream badNameOut;    openOutputFile(badNameFile, badNameOut);                
474                 
475                 while(!inputNames.eof()){
476                         if (m->control_pressed) { goodNameOut.close(); badNameOut.close(); inputNames.close(); remove(goodNameFile.c_str()); remove(badNameFile.c_str()); return 0; }
477
478                         inputNames >> seqName >> seqList;
479                         it = badSeqNames.find(seqName);
480                         
481                         if(it != badSeqNames.end()){
482                                 badSeqNames.erase(it);
483                                 badNameOut << seqName << '\t' << seqList << endl;
484                                 if(namefile != ""){
485                                         int start = 0;
486                                         for(int i=0;i<seqList.length();i++){
487                                                 if(seqList[i] == ','){
488                                                         badSeqGroups.insert(seqList.substr(start,i-start));
489                                                         start = i+1;
490                                                 }                                       
491                                         }
492                                         badSeqGroups.insert(seqList.substr(start,seqList.length()-start));
493                                 }
494                         }
495                         else{
496                                 goodNameOut << seqName << '\t' << seqList << endl;
497                         }
498                         gobble(inputNames);
499                 }
500                 inputNames.close();
501                 goodNameOut.close();
502                 badNameOut.close();
503                 
504                 //we were unable to remove some of the bad sequences
505                 if (badSeqNames.size() != 0) {
506                         for (it = badSeqNames.begin(); it != badSeqNames.end(); it++) {  
507                                 m->mothurOut("Your namefile does not include the sequence " + *it + " please correct."); 
508                                 m->mothurOutEndLine();
509                         }
510                 }
511
512                 if(groupfile != ""){
513                         
514                         ifstream inputGroups;
515                         openInputFile(groupfile, inputGroups);
516
517                         string goodGroupFile = outputDir + getRootName(getSimpleName(groupfile)) + "good" + getExtension(groupfile);
518                         string badGroupFile = outputDir + getRootName(getSimpleName(groupfile)) + "bad" + getExtension(groupfile);
519                         
520                         outputNames.push_back(goodGroupFile);  outputNames.push_back(badGroupFile);
521                         
522                         ofstream goodGroupOut;  openOutputFile(goodGroupFile, goodGroupOut);
523                         ofstream badGroupOut;   openOutputFile(badGroupFile, badGroupOut);              
524                         
525                         while(!inputGroups.eof()){
526                                 if (m->control_pressed) { goodGroupOut.close(); badGroupOut.close(); inputGroups.close(); remove(goodNameFile.c_str()); remove(badNameFile.c_str()); remove(goodGroupFile.c_str()); remove(badGroupFile.c_str()); return 0; }
527
528                                 inputGroups >> seqName >> group;
529
530                                 it = badSeqGroups.find(seqName);
531                                 
532                                 if(it != badSeqGroups.end()){
533                                         badSeqGroups.erase(it);
534                                         badGroupOut << seqName << '\t' << group << endl;
535                                 }
536                                 else{
537                                         goodGroupOut << seqName << '\t' << group << endl;
538                                 }
539                                 gobble(inputGroups);
540                         }
541                         inputGroups.close();
542                         goodGroupOut.close();
543                         badGroupOut.close();
544                         
545                         //we were unable to remove some of the bad sequences
546                         if (badSeqGroups.size() != 0) {
547                                 for (it = badSeqGroups.begin(); it != badSeqGroups.end(); it++) {  
548                                         m->mothurOut("Your namefile does not include the sequence " + *it + " please correct."); 
549                                         m->mothurOutEndLine();
550                                 }
551                         }
552                 }
553                         
554                 return 0;
555         
556         }
557         catch(exception& e) {
558                 m->errorOut(e, "ScreenSeqsCommand", "screenNameGroupFile");
559                 exit(1);
560         }
561 }
562
563 //***************************************************************************************************************
564
565 int ScreenSeqsCommand::screenGroupFile(set<string> badSeqNames){
566         try {
567                 ifstream inputGroups;
568                 openInputFile(groupfile, inputGroups);
569                 string seqName, group;
570                 set<string>::iterator it;
571                 
572                 string goodGroupFile = outputDir + getRootName(getSimpleName(groupfile)) + "good" + getExtension(groupfile);
573                 string badGroupFile = outputDir + getRootName(getSimpleName(groupfile)) + "bad" + getExtension(groupfile);
574                 
575                 outputNames.push_back(goodGroupFile);  outputNames.push_back(badGroupFile);
576                 
577                 ofstream goodGroupOut;  openOutputFile(goodGroupFile, goodGroupOut);
578                 ofstream badGroupOut;   openOutputFile(badGroupFile, badGroupOut);              
579                 
580                 while(!inputGroups.eof()){
581                         if (m->control_pressed) { goodGroupOut.close(); badGroupOut.close(); inputGroups.close(); remove(goodGroupFile.c_str()); remove(badGroupFile.c_str()); return 0; }
582
583                         inputGroups >> seqName >> group;
584                         it = badSeqNames.find(seqName);
585                         
586                         if(it != badSeqNames.end()){
587                                 badSeqNames.erase(it);
588                                 badGroupOut << seqName << '\t' << group << endl;
589                         }
590                         else{
591                                 goodGroupOut << seqName << '\t' << group << endl;
592                         }
593                         gobble(inputGroups);
594                 }
595                 
596                 if (m->control_pressed) { goodGroupOut.close(); badGroupOut.close(); inputGroups.close(); remove(goodGroupFile.c_str()); remove(badGroupFile.c_str()); return 0; }
597
598                 //we were unable to remove some of the bad sequences
599                 if (badSeqNames.size() != 0) {
600                         for (it = badSeqNames.begin(); it != badSeqNames.end(); it++) {  
601                                 m->mothurOut("Your groupfile does not include the sequence " + *it + " please correct."); 
602                                 m->mothurOutEndLine();
603                         }
604                 }
605                 
606                 inputGroups.close();
607                 goodGroupOut.close();
608                 badGroupOut.close();
609                 
610                 if (m->control_pressed) { remove(goodGroupFile.c_str()); remove(badGroupFile.c_str());  }
611
612                 
613                 return 0;
614         
615         }
616         catch(exception& e) {
617                 m->errorOut(e, "ScreenSeqsCommand", "screenGroupFile");
618                 exit(1);
619         }
620 }
621
622 //***************************************************************************************************************
623
624 int ScreenSeqsCommand::screenAlignReport(set<string> badSeqNames){
625         try {
626                 ifstream inputAlignReport;
627                 openInputFile(alignreport, inputAlignReport);
628                 string seqName, group;
629                 set<string>::iterator it;
630                 
631                 string goodAlignReportFile = outputDir + getRootName(getSimpleName(alignreport)) + "good" + getExtension(alignreport);
632                 string badAlignReportFile = outputDir + getRootName(getSimpleName(alignreport)) + "bad" + getExtension(alignreport);
633                 
634                 outputNames.push_back(goodAlignReportFile);  outputNames.push_back(badAlignReportFile);
635                 
636                 ofstream goodAlignReportOut;    openOutputFile(goodAlignReportFile, goodAlignReportOut);
637                 ofstream badAlignReportOut;             openOutputFile(badAlignReportFile, badAlignReportOut);          
638
639                 while (!inputAlignReport.eof()) {               //      need to copy header
640                         char c = inputAlignReport.get();
641                         goodAlignReportOut << c;
642                         badAlignReportOut << c;
643                         if (c == 10 || c == 13){        break;  }       
644                 }
645
646                 while(!inputAlignReport.eof()){
647                         if (m->control_pressed) { goodAlignReportOut.close(); badAlignReportOut.close(); inputAlignReport.close(); remove(goodAlignReportFile.c_str()); remove(badAlignReportFile.c_str()); return 0; }
648
649                         inputAlignReport >> seqName;
650                         it = badSeqNames.find(seqName);
651                         string line;            
652                         while (!inputAlignReport.eof()) {               //      need to copy header
653                                 char c = inputAlignReport.get();
654                                 line += c;
655                                 if (c == 10 || c == 13){        break;  }       
656                         }
657                         
658                         if(it != badSeqNames.end()){
659                                 badSeqNames.erase(it);
660                                 badAlignReportOut << seqName << '\t' << line;
661                         }
662                         else{
663                                 goodAlignReportOut << seqName << '\t' << line;
664                         }
665                         gobble(inputAlignReport);
666                 }
667                 
668                 if (m->control_pressed) { goodAlignReportOut.close(); badAlignReportOut.close(); inputAlignReport.close(); remove(goodAlignReportFile.c_str()); remove(badAlignReportFile.c_str()); return 0; }
669
670                 //we were unable to remove some of the bad sequences
671                 if (badSeqNames.size() != 0) {
672                         for (it = badSeqNames.begin(); it != badSeqNames.end(); it++) {  
673                                 m->mothurOut("Your file does not include the sequence " + *it + " please correct."); 
674                                 m->mothurOutEndLine();
675                         }
676                 }
677
678                 inputAlignReport.close();
679                 goodAlignReportOut.close();
680                 badAlignReportOut.close();
681                                 
682                 if (m->control_pressed) {  remove(goodAlignReportFile.c_str()); remove(badAlignReportFile.c_str()); return 0; }
683                 
684                 return 0;
685         
686         }
687         catch(exception& e) {
688                 m->errorOut(e, "ScreenSeqsCommand", "screenAlignReport");
689                 exit(1);
690         }
691         
692 }
693 //**********************************************************************************************************************
694
695 int ScreenSeqsCommand::driver(linePair* line, string goodFName, string badFName, string badAccnosFName, string filename, set<string>& badSeqNames){
696         try {
697                 ofstream goodFile;
698                 openOutputFile(goodFName, goodFile);
699                 
700                 ofstream badFile;
701                 openOutputFile(badFName, badFile);
702                 
703                 ofstream badAccnosFile;
704                 openOutputFile(badAccnosFName, badAccnosFile);
705                 
706                 ifstream inFASTA;
707                 openInputFile(filename, inFASTA);
708
709                 inFASTA.seekg(line->start);
710         
711                 for(int i=0;i<line->numSeqs;i++){
712                 
713                         if (m->control_pressed) {  return 0; }
714                         
715                         Sequence currSeq(inFASTA);
716                         if (currSeq.getName() != "") {
717                                 bool goodSeq = 1;               //      innocent until proven guilty
718                                 if(goodSeq == 1 && startPos != -1 && startPos < currSeq.getStartPos())                  {       goodSeq = 0;    }
719                                 if(goodSeq == 1 && endPos != -1 && endPos > currSeq.getEndPos())                                {       goodSeq = 0;    }
720                                 if(goodSeq == 1 && maxAmbig != -1 && maxAmbig < currSeq.getAmbigBases())                {       goodSeq = 0;    }
721                                 if(goodSeq == 1 && maxHomoP != -1 && maxHomoP < currSeq.getLongHomoPolymer())   {       goodSeq = 0;    }
722                                 if(goodSeq == 1 && minLength != -1 && minLength > currSeq.getNumBases())                {       goodSeq = 0;    }
723                                 if(goodSeq == 1 && maxLength != -1 && maxLength < currSeq.getNumBases())                {       goodSeq = 0;    }
724                                 
725                                 if(goodSeq == 1){
726                                         currSeq.printSequence(goodFile);        
727                                 }
728                                 else{
729                                         currSeq.printSequence(badFile); 
730                                         badAccnosFile << currSeq.getName() << endl;
731                                         badSeqNames.insert(currSeq.getName());
732                                 }
733                         }
734                         gobble(inFASTA);
735                 }
736                 
737                         
738                 goodFile.close();
739                 inFASTA.close();
740                 badFile.close();
741                 badAccnosFile.close();
742                 
743                 return 1;
744         }
745         catch(exception& e) {
746                 m->errorOut(e, "ScreenSeqsCommand", "driver");
747                 exit(1);
748         }
749 }
750 //**********************************************************************************************************************
751 #ifdef USE_MPI
752 int ScreenSeqsCommand::driverMPI(int start, int num, MPI_File& inMPI, MPI_File& goodFile, MPI_File& badFile, MPI_File& badAccnosFile, vector<long>& MPIPos, set<string>& badSeqNames){
753         try {
754                 string outputString = "";
755                 MPI_Status statusGood; 
756                 MPI_Status statusBad; 
757                 MPI_Status statusBadAccnos; 
758                 MPI_Status status; 
759                 int pid;
760                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
761
762                 for(int i=0;i<num;i++){
763                 
764                         if (m->control_pressed) {  return 0; }
765                         
766                         //read next sequence
767                         int length = MPIPos[start+i+1] - MPIPos[start+i];
768
769                         char* buf4 = new char[length];
770                         memcpy(buf4, outputString.c_str(), length);
771
772                         MPI_File_read_at(inMPI, MPIPos[start+i], buf4, length, MPI_CHAR, &status);
773                         
774                         string tempBuf = buf4;  delete buf4;
775                         if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length);  }
776                         istringstream iss (tempBuf,istringstream::in);
777                         
778                         Sequence currSeq(iss);                  
779                         
780                         //process seq
781                         if (currSeq.getName() != "") {
782                                 bool goodSeq = 1;               //      innocent until proven guilty
783                                 if(goodSeq == 1 && startPos != -1 && startPos < currSeq.getStartPos())                  {       goodSeq = 0;    }
784                                 if(goodSeq == 1 && endPos != -1 && endPos > currSeq.getEndPos())                                {       goodSeq = 0;    }
785                                 if(goodSeq == 1 && maxAmbig != -1 && maxAmbig < currSeq.getAmbigBases())                {       goodSeq = 0;    }
786                                 if(goodSeq == 1 && maxHomoP != -1 && maxHomoP < currSeq.getLongHomoPolymer())   {       goodSeq = 0;    }
787                                 if(goodSeq == 1 && minLength != -1 && minLength > currSeq.getNumBases())                {       goodSeq = 0;    }
788                                 if(goodSeq == 1 && maxLength != -1 && maxLength < currSeq.getNumBases())                {       goodSeq = 0;    }
789                                 
790                                 if(goodSeq == 1){
791                                         outputString =  ">" + currSeq.getName() + "\n" + currSeq.getAligned() + "\n";
792                                 
793                                         //print good seq
794                                         length = outputString.length();
795                                         char* buf2 = new char[length];
796                                         memcpy(buf2, outputString.c_str(), length);
797                                         
798                                         MPI_File_write_shared(goodFile, buf2, length, MPI_CHAR, &statusGood);
799                                         delete buf2;
800                                 }
801                                 else{
802                                         outputString =  ">" + currSeq.getName() + "\n" + currSeq.getAligned() + "\n";
803                                 
804                                         //print bad seq to fasta
805                                         length = outputString.length();
806                                         char* buf2 = new char[length];
807                                         memcpy(buf2, outputString.c_str(), length);
808                                         
809                                         MPI_File_write_shared(badFile, buf2, length, MPI_CHAR, &statusBad);
810                                         delete buf2;
811
812                                         badSeqNames.insert(currSeq.getName());
813                                         
814                                         //write to bad accnos file
815                                         outputString = currSeq.getName() + "\n";
816                                 
817                                         length = outputString.length();
818                                         char* buf3 = new char[length];
819                                         memcpy(buf3, outputString.c_str(), length);
820                                         
821                                         MPI_File_write_shared(badAccnosFile, buf3, length, MPI_CHAR, &statusBadAccnos);
822                                         delete buf3;
823                                 }
824                         }
825                 }
826                                 
827                 return 1;
828         }
829         catch(exception& e) {
830                 m->errorOut(e, "ScreenSeqsCommand", "driverMPI");
831                 exit(1);
832         }
833 }
834 #endif
835 /**************************************************************************************************/
836
837 int ScreenSeqsCommand::createProcesses(string goodFileName, string badFileName, string badAccnos, string filename, set<string>& badSeqNames) {
838         try {
839 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
840                 int process = 0;
841                 int exitCommand = 1;
842                 
843                 //loop through and create all the processes you want
844                 while (process != processors) {
845                         int pid = fork();
846                         
847                         if (pid > 0) {
848                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
849                                 process++;
850                         }else if (pid == 0){
851                                 exitCommand = driver(lines[process], goodFileName + toString(getpid()) + ".temp", badFileName + toString(getpid()) + ".temp", badAccnos + toString(getpid()) + ".temp", filename, badSeqNames);
852                                 exit(0);
853                         }else { m->mothurOut("unable to spawn the necessary processes."); m->mothurOutEndLine(); exit(0); }
854                 }
855                 
856                 //force parent to wait until all the processes are done
857                 for (int i=0;i<processors;i++) { 
858                         int temp = processIDS[i];
859                         wait(&temp);
860                 }
861                 
862                 return exitCommand;
863 #endif          
864         }
865         catch(exception& e) {
866                 m->errorOut(e, "ScreenSeqsCommand", "createProcesses");
867                 exit(1);
868         }
869 }
870
871 //***************************************************************************************************************
872
873