]> git.donarmstrong.com Git - mothur.git/blob - screenseqscommand.cpp
paralellized summary.seqs and added mpi code to it.
[mothur.git] / screenseqscommand.cpp
1 /*
2  *  screenseqscommand.cpp
3  *  Mothur
4  *
5  *  Created by Pat Schloss on 6/3/09.
6  *  Copyright 2009 Patrick D. Schloss. All rights reserved.
7  *
8  */
9
10 #include "screenseqscommand.h"
11 #include "sequence.hpp"
12
13 //***************************************************************************************************************
14
15 ScreenSeqsCommand::ScreenSeqsCommand(string option)  {
16         try {
17                 abort = false;
18                 
19                 //allow user to run help
20                 if(option == "help") { help(); abort = true; }
21                 
22                 else {
23                         //valid paramters for this command
24                         string AlignArray[] =  {"fasta", "start", "end", "maxambig", "maxhomop", "minlength", "maxlength",
25                                                                         "name", "group", "alignreport","processors","outputdir","inputdir"};
26                         vector<string> myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string)));
27                         
28                         OptionParser parser(option);
29                         map<string,string> parameters = parser.getParameters();
30                         
31                         ValidParameters validParameter;
32                         map<string,string>::iterator it;
33                         
34                         //check to make sure all parameters are valid for command
35                         for (it = parameters.begin(); it != parameters.end(); it++) { 
36                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
37                         }
38                         
39                         //if the user changes the input directory command factory will send this info to us in the output parameter 
40                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
41                         if (inputDir == "not found"){   inputDir = "";          }
42                         else {
43                                 string path;
44                                 it = parameters.find("fasta");
45                                 //user has given a template file
46                                 if(it != parameters.end()){ 
47                                         path = hasPath(it->second);
48                                         //if the user has not given a path then, add inputdir. else leave path alone.
49                                         if (path == "") {       parameters["fasta"] = inputDir + it->second;            }
50                                 }
51                                 
52                                 it = parameters.find("group");
53                                 //user has given a template file
54                                 if(it != parameters.end()){ 
55                                         path = hasPath(it->second);
56                                         //if the user has not given a path then, add inputdir. else leave path alone.
57                                         if (path == "") {       parameters["group"] = inputDir + it->second;            }
58                                 }
59                                 
60                                 it = parameters.find("name");
61                                 //user has given a template file
62                                 if(it != parameters.end()){ 
63                                         path = hasPath(it->second);
64                                         //if the user has not given a path then, add inputdir. else leave path alone.
65                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
66                                 }
67                                 
68                                 it = parameters.find("alignreport");
69                                 //user has given a template file
70                                 if(it != parameters.end()){ 
71                                         path = hasPath(it->second);
72                                         //if the user has not given a path then, add inputdir. else leave path alone.
73                                         if (path == "") {       parameters["alignreport"] = inputDir + it->second;              }
74                                 }
75                         }
76
77                         //check for required parameters
78                         fastafile = validParameter.validFile(parameters, "fasta", true);
79                         if (fastafile == "not found") { m->mothurOut("fasta is a required parameter for the screen.seqs command."); m->mothurOutEndLine(); abort = true; }
80                         else if (fastafile == "not open") { abort = true; }     
81         
82                         groupfile = validParameter.validFile(parameters, "group", true);
83                         if (groupfile == "not open") { abort = true; }  
84                         else if (groupfile == "not found") { groupfile = ""; }
85                         
86                         namefile = validParameter.validFile(parameters, "name", true);
87                         if (namefile == "not open") { abort = true; }
88                         else if (namefile == "not found") { namefile = ""; }    
89
90                         alignreport = validParameter.validFile(parameters, "alignreport", true);
91                         if (alignreport == "not open") { abort = true; }
92                         else if (alignreport == "not found") { alignreport = ""; }      
93                         
94                         //if the user changes the output directory command factory will send this info to us in the output parameter 
95                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  
96                                 outputDir = ""; 
97                                 outputDir += hasPath(fastafile); //if user entered a file with a path then preserve it  
98                         }
99
100                         //check for optional parameter and set defaults
101                         // ...at some point should added some additional type checking...
102                         string temp;
103                         temp = validParameter.validFile(parameters, "start", false);            if (temp == "not found") { temp = "-1"; }
104                         convert(temp, startPos); 
105                 
106                         temp = validParameter.validFile(parameters, "end", false);                      if (temp == "not found") { temp = "-1"; }
107                         convert(temp, endPos);  
108
109                         temp = validParameter.validFile(parameters, "maxambig", false);         if (temp == "not found") { temp = "-1"; }
110                         convert(temp, maxAmbig);  
111
112                         temp = validParameter.validFile(parameters, "maxhomop", false);         if (temp == "not found") { temp = "-1"; }
113                         convert(temp, maxHomoP);  
114
115                         temp = validParameter.validFile(parameters, "minlength", false);        if (temp == "not found") { temp = "-1"; }
116                         convert(temp, minLength); 
117                         
118                         temp = validParameter.validFile(parameters, "maxlength", false);        if (temp == "not found") { temp = "-1"; }
119                         convert(temp, maxLength); 
120                         
121                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found"){       temp = "1";                             }
122                         convert(temp, processors); 
123
124                 }
125
126         }
127         catch(exception& e) {
128                 m->errorOut(e, "ScreenSeqsCommand", "ScreenSeqsCommand");
129                 exit(1);
130         }
131 }
132 //**********************************************************************************************************************
133
134 void ScreenSeqsCommand::help(){
135         try {
136                 m->mothurOut("The screen.seqs command reads a fastafile and creates .....\n");
137                 m->mothurOut("The screen.seqs command parameters are fasta, start, end, maxambig, maxhomop, minlength, maxlength, name, group and processors.\n");
138                 m->mothurOut("The fasta parameter is required.\n");
139                 m->mothurOut("The start parameter .... The default is -1.\n");
140                 m->mothurOut("The end parameter .... The default is -1.\n");
141                 m->mothurOut("The maxambig parameter .... The default is -1.\n");
142                 m->mothurOut("The maxhomop parameter .... The default is -1.\n");
143                 m->mothurOut("The minlength parameter .... The default is -1.\n");
144                 m->mothurOut("The maxlength parameter .... The default is -1.\n");
145                 m->mothurOut("The processors parameter allows you to specify the number of processors to use while running the command. The default is 1.\n");
146                 m->mothurOut("The name parameter allows you to provide a namesfile, and the group parameter allows you to provide a groupfile.\n");
147                 m->mothurOut("The screen.seqs command should be in the following format: \n");
148                 m->mothurOut("screen.seqs(fasta=yourFastaFile, name=youNameFile, group=yourGroupFIle, start=yourStart, end=yourEnd, maxambig=yourMaxambig,  \n");
149                 m->mothurOut("maxhomop=yourMaxhomop, minlength=youMinlength, maxlength=yourMaxlength)  \n");    
150                 m->mothurOut("Example screen.seqs(fasta=abrecovery.fasta, name=abrecovery.names, group=abrecovery.groups, start=..., end=..., maxambig=..., maxhomop=..., minlength=..., maxlength=...).\n");
151                 m->mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n\n");
152
153         }
154         catch(exception& e) {
155                 m->errorOut(e, "ScreenSeqsCommand", "help");
156                 exit(1);
157         }
158 }
159
160 //***************************************************************************************************************
161
162 ScreenSeqsCommand::~ScreenSeqsCommand(){        /*      do nothing      */      }
163
164 //***************************************************************************************************************
165
166 int ScreenSeqsCommand::execute(){
167         try{
168                 
169                 if (abort == true) { return 0; }
170                                 
171                 string goodSeqFile = outputDir + getRootName(getSimpleName(fastafile)) + "good" + getExtension(fastafile);
172                 string badSeqFile =  outputDir + getRootName(getSimpleName(fastafile)) + "bad" + getExtension(fastafile);
173                 string badAccnosFile =  outputDir + getRootName(getSimpleName(fastafile)) + "bad.accnos";
174                 
175                 int numFastaSeqs = 0;
176                 set<string> badSeqNames;
177                 int start = time(NULL);
178                 
179 #ifdef USE_MPI  
180                         int pid, end, numSeqsPerProcessor; 
181                         int tag = 2001;
182                         vector<long> MPIPos;
183                         
184                         MPI_Status status; 
185                         MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
186                         MPI_Comm_size(MPI_COMM_WORLD, &processors); 
187
188                         MPI_File inMPI;
189                         MPI_File outMPIGood;
190                         MPI_File outMPIBad;
191                         MPI_File outMPIBadAccnos;
192                         
193                         int outMode=MPI_MODE_CREATE|MPI_MODE_WRONLY; 
194                         int inMode=MPI_MODE_RDONLY; 
195                         
196                         char outGoodFilename[1024];
197                         strcpy(outGoodFilename, goodSeqFile.c_str());
198
199                         char outBadFilename[1024];
200                         strcpy(outBadFilename, badSeqFile.c_str());
201                         
202                         char outBadAccnosFilename[1024];
203                         strcpy(outBadAccnosFilename, badAccnosFile.c_str());
204
205                         char inFileName[1024];
206                         strcpy(inFileName, fastafile.c_str());
207                         
208                         MPI_File_open(MPI_COMM_WORLD, inFileName, inMode, MPI_INFO_NULL, &inMPI);  //comm, filename, mode, info, filepointer
209                         MPI_File_open(MPI_COMM_WORLD, outGoodFilename, outMode, MPI_INFO_NULL, &outMPIGood);
210                         MPI_File_open(MPI_COMM_WORLD, outBadFilename, outMode, MPI_INFO_NULL, &outMPIBad);
211                         MPI_File_open(MPI_COMM_WORLD, outBadAccnosFilename, outMode, MPI_INFO_NULL, &outMPIBadAccnos);
212                         
213                         if (m->control_pressed) { MPI_File_close(&inMPI);  MPI_File_close(&outMPIGood);  MPI_File_close(&outMPIBad); MPI_File_close(&outMPIBadAccnos); return 0; }
214                         
215                         if (pid == 0) { //you are the root process 
216                                 
217                                 MPIPos = setFilePosFasta(fastafile, numFastaSeqs); //fills MPIPos, returns numSeqs
218                                 
219                                 //send file positions to all processes
220                                 MPI_Bcast(&numFastaSeqs, 1, MPI_INT, 0, MPI_COMM_WORLD);  //send numSeqs
221                                 MPI_Bcast(&MPIPos[0], (numFastaSeqs+1), MPI_LONG, 0, MPI_COMM_WORLD); //send file pos   
222                                 
223                                 //figure out how many sequences you have to align
224                                 numSeqsPerProcessor = numFastaSeqs / processors;
225                                 int startIndex =  pid * numSeqsPerProcessor;
226                                 if(pid == (processors - 1)){    numSeqsPerProcessor = numFastaSeqs - pid * numSeqsPerProcessor;         }
227                                 
228                                 //align your part
229                                 driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPIGood, outMPIBad, outMPIBadAccnos, MPIPos, badSeqNames);
230
231                                 if (m->control_pressed) { MPI_File_close(&inMPI);  MPI_File_close(&outMPIGood);  MPI_File_close(&outMPIBadAccnos); MPI_File_close(&outMPIBad);  return 0; }
232
233                                 for (int i = 1; i < processors; i++) {
234                                 
235                                         //get bad lists
236                                         int badSize;
237                                         MPI_Recv(&badSize, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
238                                         /*for (int j = 0; j < badSize; j++) {
239                                                 int length;
240                                                 MPI_Recv(&length, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);  //recv the length of the name
241                                                 char* buf2 = new char[length];                                                                          //make space to recieve it
242                                                 MPI_Recv(buf2, length, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status);  //get name
243                                                 
244                                                 string tempBuf = buf2;
245                                                 if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length); }
246                                                 delete buf2;
247                                                 
248                                                 badSeqNames.insert(tempBuf);
249                                         }*/
250                                 }
251                         }else{ //you are a child process
252                                 MPI_Bcast(&numFastaSeqs, 1, MPI_INT, 0, MPI_COMM_WORLD); //get numSeqs
253                                 MPIPos.resize(numFastaSeqs+1);
254                                 MPI_Bcast(&MPIPos[0], (numFastaSeqs+1), MPI_LONG, 0, MPI_COMM_WORLD); //get file positions
255                                 
256                                 //figure out how many sequences you have to align
257                                 numSeqsPerProcessor = numFastaSeqs / processors;
258                                 int startIndex =  pid * numSeqsPerProcessor;
259                                 if(pid == (processors - 1)){    numSeqsPerProcessor = numFastaSeqs - pid * numSeqsPerProcessor;         }
260                                 
261                                 //align your part
262                                 driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPIGood, outMPIBad, outMPIBadAccnos, MPIPos, badSeqNames);
263
264                                 if (m->control_pressed) { MPI_File_close(&inMPI);  MPI_File_close(&outMPIGood);  MPI_File_close(&outMPIBad); MPI_File_close(&outMPIBadAccnos); return 0; }
265                                 
266                                 //send bad list 
267                                 int badSize = badSeqNames.size();
268                                 MPI_Send(&badSize, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
269                                 
270                                 /*
271                                 set<string>::iterator it;
272                                 for (it = badSeqNames.begin(); it != badSeqNames.end(); it++) {
273                                         string name = *it;
274                                         int length = name.length();
275                                         char* buf2 = new char[length];
276                                         memcpy(buf2, name.c_str(), length);
277                                         
278                                         MPI_Send(&length, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
279                                         MPI_Send(buf2, length, MPI_CHAR, 0, tag, MPI_COMM_WORLD);
280                                 }*/
281                         }
282                         
283                         //close files 
284                         MPI_File_close(&inMPI);
285                         MPI_File_close(&outMPIGood);
286                         MPI_File_close(&outMPIBad);
287                         MPI_File_close(&outMPIBadAccnos);
288                                         
289 #else
290                                         
291         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
292                         if(processors == 1){
293                                 ifstream inFASTA;
294                                 openInputFile(fastafile, inFASTA);
295                                 numFastaSeqs=count(istreambuf_iterator<char>(inFASTA),istreambuf_iterator<char>(), '>');
296                                 inFASTA.close();
297                                 
298                                 lines.push_back(new linePair(0, numFastaSeqs));
299                                 
300                                 driver(lines[0], goodSeqFile, badSeqFile, badAccnosFile, fastafile, badSeqNames);
301                                 
302                                 if (m->control_pressed) { remove(goodSeqFile.c_str()); remove(badSeqFile.c_str()); return 0; }
303                                 
304                         }else{
305                                 vector<int> positions;
306                                 processIDS.resize(0);
307                                 
308                                 ifstream inFASTA;
309                                 openInputFile(fastafile, inFASTA);
310                                 
311                                 string input;
312                                 while(!inFASTA.eof()){
313                                         input = getline(inFASTA);
314                                         if (input.length() != 0) {
315                                                 if(input[0] == '>'){    long int pos = inFASTA.tellg(); positions.push_back(pos - input.length() - 1);  }
316                                         }
317                                 }
318                                 inFASTA.close();
319                                 
320                                 numFastaSeqs = positions.size();
321                                 
322                                 int numSeqsPerProcessor = numFastaSeqs / processors;
323                                 
324                                 for (int i = 0; i < processors; i++) {
325                                         long int startPos = positions[ i * numSeqsPerProcessor ];
326                                         if(i == processors - 1){
327                                                 numSeqsPerProcessor = numFastaSeqs - i * numSeqsPerProcessor;
328                                         }
329                                         lines.push_back(new linePair(startPos, numSeqsPerProcessor));
330                                 }
331                                 
332                                 createProcesses(goodSeqFile, badSeqFile, badAccnosFile, fastafile, badSeqNames); 
333                                 
334                                 rename((goodSeqFile + toString(processIDS[0]) + ".temp").c_str(), goodSeqFile.c_str());
335                                 rename((badSeqFile + toString(processIDS[0]) + ".temp").c_str(), badSeqFile.c_str());
336                                 rename((badAccnosFile + toString(processIDS[0]) + ".temp").c_str(), badAccnosFile.c_str());
337                                 
338                                 //append alignment and report files
339                                 for(int i=1;i<processors;i++){
340                                         appendFiles((goodSeqFile + toString(processIDS[i]) + ".temp"), goodSeqFile);
341                                         remove((goodSeqFile + toString(processIDS[i]) + ".temp").c_str());
342                                         
343                                         appendFiles((badSeqFile + toString(processIDS[i]) + ".temp"), badSeqFile);
344                                         remove((badSeqFile + toString(processIDS[i]) + ".temp").c_str());
345                                         
346                                         appendFiles((badAccnosFile + toString(processIDS[i]) + ".temp"), badAccnosFile);
347                                         remove((badAccnosFile + toString(processIDS[i]) + ".temp").c_str());
348                                 }
349                                 
350                                 if (m->control_pressed) { remove(goodSeqFile.c_str()); remove(badSeqFile.c_str()); return 0; }
351                                 
352                                 //read badSeqs in because root process doesnt know what other "bad" seqs the children found
353                                 ifstream inBad;
354                                 int ableToOpen = openInputFile(badAccnosFile, inBad, "no error");
355                                 
356                                 if (ableToOpen == 0) {
357                                         badSeqNames.clear();
358                                         string tempName;
359                                         while (!inBad.eof()) {
360                                                 inBad >> tempName; gobble(inBad);
361                                                 badSeqNames.insert(tempName);
362                                         }
363                                         inBad.close();
364                                 }
365                         }
366         #else
367                         ifstream inFASTA;
368                         openInputFile(fastafile, inFASTA);
369                         numFastaSeqs=count(istreambuf_iterator<char>(inFASTA),istreambuf_iterator<char>(), '>');
370                         inFASTA.close();
371                         
372                         lines.push_back(new linePair(0, numFastaSeqs));
373                         
374                         driver(lines[0], goodSeqFile, badSeqFile, badAccnosFile, fastafile, badSeqNames);
375                         
376                         if (m->control_pressed) { remove(goodSeqFile.c_str()); remove(badSeqFile.c_str()); return 0; }
377                         
378         #endif
379
380 #endif          
381
382                 #ifdef USE_MPI
383                         MPI_Comm_rank(MPI_COMM_WORLD, &pid); 
384                                         
385                         if (pid == 0) { //only one process should fix files
386                         
387                                 //read accnos file with all names in it, process 0 just has its names
388                                 MPI_File inMPIAccnos;
389                                 MPI_Offset size;
390                         
391                                 char inFileName[1024];
392                                 strcpy(inFileName, badAccnosFile.c_str());
393                         
394                                 MPI_File_open(MPI_COMM_SELF, inFileName, inMode, MPI_INFO_NULL, &inMPIAccnos);  //comm, filename, mode, info, filepointer
395                                 MPI_File_get_size(inMPIAccnos, &size);
396                         
397                                 char* buffer = new char[size];
398                                 MPI_File_read(inMPIAccnos, buffer, size, MPI_CHAR, &status);
399                         
400                                 string tempBuf = buffer;
401                                 if (tempBuf.length() > size) { tempBuf = tempBuf.substr(0, size);  }
402                                 istringstream iss (tempBuf,istringstream::in);
403
404                                 delete buffer;
405                                 MPI_File_close(&inMPIAccnos);
406                                 
407                                 badSeqNames.clear();
408                                 string tempName;
409                                 while (!iss.eof()) {
410                                         iss >> tempName; gobble(iss);
411                                         badSeqNames.insert(tempName);
412                                 }
413                 #endif
414                                                                                                                                                                         
415                 if(namefile != "" && groupfile != "")   {       
416                         screenNameGroupFile(badSeqNames);       
417                         if (m->control_pressed) {  remove(goodSeqFile.c_str()); remove(badSeqFile.c_str()); return 0; }
418                 }else if(namefile != "")        {       
419                         screenNameGroupFile(badSeqNames);
420                         if (m->control_pressed) {  remove(goodSeqFile.c_str()); remove(badSeqFile.c_str()); return 0; } 
421                 }else if(groupfile != "")                               {       screenGroupFile(badSeqNames);           }       // this screens just the group
422                 
423                 if (m->control_pressed) { remove(goodSeqFile.c_str()); remove(badSeqFile.c_str()); return 0; }
424
425                 if(alignreport != "")                                   {       screenAlignReport(badSeqNames);         }
426                 
427                 if (m->control_pressed) { remove(goodSeqFile.c_str()); remove(badSeqFile.c_str()); return 0; }
428                 
429                 #ifdef USE_MPI
430                         }
431                 #endif
432
433                 m->mothurOutEndLine();
434                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
435                 m->mothurOut(goodSeqFile); m->mothurOutEndLine();       
436                 m->mothurOut(badSeqFile); m->mothurOutEndLine();        
437                 m->mothurOut(badAccnosFile); m->mothurOutEndLine();     
438                 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
439                 m->mothurOutEndLine();
440                 m->mothurOutEndLine();
441
442                 m->mothurOut("It took " + toString(time(NULL) - start) + " secs to screen " + toString(numFastaSeqs) + " sequences.");
443                 m->mothurOutEndLine();
444
445                 return 0;
446         }
447         catch(exception& e) {
448                 m->errorOut(e, "ScreenSeqsCommand", "execute");
449                 exit(1);
450         }
451 }
452
453 //***************************************************************************************************************
454
455 int ScreenSeqsCommand::screenNameGroupFile(set<string> badSeqNames){
456         try {
457                 ifstream inputNames;
458                 openInputFile(namefile, inputNames);
459                 set<string> badSeqGroups;
460                 string seqName, seqList, group;
461                 set<string>::iterator it;
462
463                 string goodNameFile = outputDir + getRootName(getSimpleName(namefile)) + "good" + getExtension(namefile);
464                 string badNameFile = outputDir + getRootName(getSimpleName(namefile)) + "bad" + getExtension(namefile);
465                 
466                 outputNames.push_back(goodNameFile);  outputNames.push_back(badNameFile);
467                 
468                 ofstream goodNameOut;   openOutputFile(goodNameFile, goodNameOut);
469                 ofstream badNameOut;    openOutputFile(badNameFile, badNameOut);                
470                 
471                 while(!inputNames.eof()){
472                         if (m->control_pressed) { goodNameOut.close(); badNameOut.close(); inputNames.close(); remove(goodNameFile.c_str()); remove(badNameFile.c_str()); return 0; }
473
474                         inputNames >> seqName >> seqList;
475                         it = badSeqNames.find(seqName);
476                         
477                         if(it != badSeqNames.end()){
478                                 badSeqNames.erase(it);
479                                 badNameOut << seqName << '\t' << seqList << endl;
480                                 if(namefile != ""){
481                                         int start = 0;
482                                         for(int i=0;i<seqList.length();i++){
483                                                 if(seqList[i] == ','){
484                                                         badSeqGroups.insert(seqList.substr(start,i-start));
485                                                         start = i+1;
486                                                 }                                       
487                                         }
488                                         badSeqGroups.insert(seqList.substr(start,seqList.length()-start));
489                                 }
490                         }
491                         else{
492                                 goodNameOut << seqName << '\t' << seqList << endl;
493                         }
494                         gobble(inputNames);
495                 }
496                 inputNames.close();
497                 goodNameOut.close();
498                 badNameOut.close();
499                 
500                 //we were unable to remove some of the bad sequences
501                 if (badSeqNames.size() != 0) {
502                         for (it = badSeqNames.begin(); it != badSeqNames.end(); it++) {  
503                                 m->mothurOut("Your namefile does not include the sequence " + *it + " please correct."); 
504                                 m->mothurOutEndLine();
505                         }
506                 }
507
508                 if(groupfile != ""){
509                         
510                         ifstream inputGroups;
511                         openInputFile(groupfile, inputGroups);
512
513                         string goodGroupFile = outputDir + getRootName(getSimpleName(groupfile)) + "good" + getExtension(groupfile);
514                         string badGroupFile = outputDir + getRootName(getSimpleName(groupfile)) + "bad" + getExtension(groupfile);
515                         
516                         outputNames.push_back(goodGroupFile);  outputNames.push_back(badGroupFile);
517                         
518                         ofstream goodGroupOut;  openOutputFile(goodGroupFile, goodGroupOut);
519                         ofstream badGroupOut;   openOutputFile(badGroupFile, badGroupOut);              
520                         
521                         while(!inputGroups.eof()){
522                                 if (m->control_pressed) { goodGroupOut.close(); badGroupOut.close(); inputGroups.close(); remove(goodNameFile.c_str()); remove(badNameFile.c_str()); remove(goodGroupFile.c_str()); remove(badGroupFile.c_str()); return 0; }
523
524                                 inputGroups >> seqName >> group;
525
526                                 it = badSeqGroups.find(seqName);
527                                 
528                                 if(it != badSeqGroups.end()){
529                                         badSeqGroups.erase(it);
530                                         badGroupOut << seqName << '\t' << group << endl;
531                                 }
532                                 else{
533                                         goodGroupOut << seqName << '\t' << group << endl;
534                                 }
535                                 gobble(inputGroups);
536                         }
537                         inputGroups.close();
538                         goodGroupOut.close();
539                         badGroupOut.close();
540                         
541                         //we were unable to remove some of the bad sequences
542                         if (badSeqGroups.size() != 0) {
543                                 for (it = badSeqGroups.begin(); it != badSeqGroups.end(); it++) {  
544                                         m->mothurOut("Your namefile does not include the sequence " + *it + " please correct."); 
545                                         m->mothurOutEndLine();
546                                 }
547                         }
548                 }
549                         
550                 return 0;
551         
552         }
553         catch(exception& e) {
554                 m->errorOut(e, "ScreenSeqsCommand", "screenNameGroupFile");
555                 exit(1);
556         }
557 }
558
559 //***************************************************************************************************************
560
561 int ScreenSeqsCommand::screenGroupFile(set<string> badSeqNames){
562         try {
563                 ifstream inputGroups;
564                 openInputFile(groupfile, inputGroups);
565                 string seqName, group;
566                 set<string>::iterator it;
567                 
568                 string goodGroupFile = outputDir + getRootName(getSimpleName(groupfile)) + "good" + getExtension(groupfile);
569                 string badGroupFile = outputDir + getRootName(getSimpleName(groupfile)) + "bad" + getExtension(groupfile);
570                 
571                 outputNames.push_back(goodGroupFile);  outputNames.push_back(badGroupFile);
572                 
573                 ofstream goodGroupOut;  openOutputFile(goodGroupFile, goodGroupOut);
574                 ofstream badGroupOut;   openOutputFile(badGroupFile, badGroupOut);              
575                 
576                 while(!inputGroups.eof()){
577                         if (m->control_pressed) { goodGroupOut.close(); badGroupOut.close(); inputGroups.close(); remove(goodGroupFile.c_str()); remove(badGroupFile.c_str()); return 0; }
578
579                         inputGroups >> seqName >> group;
580                         it = badSeqNames.find(seqName);
581                         
582                         if(it != badSeqNames.end()){
583                                 badSeqNames.erase(it);
584                                 badGroupOut << seqName << '\t' << group << endl;
585                         }
586                         else{
587                                 goodGroupOut << seqName << '\t' << group << endl;
588                         }
589                         gobble(inputGroups);
590                 }
591                 
592                 if (m->control_pressed) { goodGroupOut.close(); badGroupOut.close(); inputGroups.close(); remove(goodGroupFile.c_str()); remove(badGroupFile.c_str()); return 0; }
593
594                 //we were unable to remove some of the bad sequences
595                 if (badSeqNames.size() != 0) {
596                         for (it = badSeqNames.begin(); it != badSeqNames.end(); it++) {  
597                                 m->mothurOut("Your groupfile does not include the sequence " + *it + " please correct."); 
598                                 m->mothurOutEndLine();
599                         }
600                 }
601                 
602                 inputGroups.close();
603                 goodGroupOut.close();
604                 badGroupOut.close();
605                 
606                 if (m->control_pressed) { remove(goodGroupFile.c_str()); remove(badGroupFile.c_str());  }
607
608                 
609                 return 0;
610         
611         }
612         catch(exception& e) {
613                 m->errorOut(e, "ScreenSeqsCommand", "screenGroupFile");
614                 exit(1);
615         }
616 }
617
618 //***************************************************************************************************************
619
620 int ScreenSeqsCommand::screenAlignReport(set<string> badSeqNames){
621         try {
622                 ifstream inputAlignReport;
623                 openInputFile(alignreport, inputAlignReport);
624                 string seqName, group;
625                 set<string>::iterator it;
626                 
627                 string goodAlignReportFile = outputDir + getRootName(getSimpleName(alignreport)) + "good" + getExtension(alignreport);
628                 string badAlignReportFile = outputDir + getRootName(getSimpleName(alignreport)) + "bad" + getExtension(alignreport);
629                 
630                 outputNames.push_back(goodAlignReportFile);  outputNames.push_back(badAlignReportFile);
631                 
632                 ofstream goodAlignReportOut;    openOutputFile(goodAlignReportFile, goodAlignReportOut);
633                 ofstream badAlignReportOut;             openOutputFile(badAlignReportFile, badAlignReportOut);          
634
635                 while (!inputAlignReport.eof()) {               //      need to copy header
636                         char c = inputAlignReport.get();
637                         goodAlignReportOut << c;
638                         badAlignReportOut << c;
639                         if (c == 10 || c == 13){        break;  }       
640                 }
641
642                 while(!inputAlignReport.eof()){
643                         if (m->control_pressed) { goodAlignReportOut.close(); badAlignReportOut.close(); inputAlignReport.close(); remove(goodAlignReportFile.c_str()); remove(badAlignReportFile.c_str()); return 0; }
644
645                         inputAlignReport >> seqName;
646                         it = badSeqNames.find(seqName);
647                         string line;            
648                         while (!inputAlignReport.eof()) {               //      need to copy header
649                                 char c = inputAlignReport.get();
650                                 line += c;
651                                 if (c == 10 || c == 13){        break;  }       
652                         }
653                         
654                         if(it != badSeqNames.end()){
655                                 badSeqNames.erase(it);
656                                 badAlignReportOut << seqName << '\t' << line;
657                         }
658                         else{
659                                 goodAlignReportOut << seqName << '\t' << line;
660                         }
661                         gobble(inputAlignReport);
662                 }
663                 
664                 if (m->control_pressed) { goodAlignReportOut.close(); badAlignReportOut.close(); inputAlignReport.close(); remove(goodAlignReportFile.c_str()); remove(badAlignReportFile.c_str()); return 0; }
665
666                 //we were unable to remove some of the bad sequences
667                 if (badSeqNames.size() != 0) {
668                         for (it = badSeqNames.begin(); it != badSeqNames.end(); it++) {  
669                                 m->mothurOut("Your file does not include the sequence " + *it + " please correct."); 
670                                 m->mothurOutEndLine();
671                         }
672                 }
673
674                 inputAlignReport.close();
675                 goodAlignReportOut.close();
676                 badAlignReportOut.close();
677                                 
678                 if (m->control_pressed) {  remove(goodAlignReportFile.c_str()); remove(badAlignReportFile.c_str()); return 0; }
679                 
680                 return 0;
681         
682         }
683         catch(exception& e) {
684                 m->errorOut(e, "ScreenSeqsCommand", "screenAlignReport");
685                 exit(1);
686         }
687         
688 }
689 //**********************************************************************************************************************
690
691 int ScreenSeqsCommand::driver(linePair* line, string goodFName, string badFName, string badAccnosFName, string filename, set<string>& badSeqNames){
692         try {
693                 ofstream goodFile;
694                 openOutputFile(goodFName, goodFile);
695                 
696                 ofstream badFile;
697                 openOutputFile(badFName, badFile);
698                 
699                 ofstream badAccnosFile;
700                 openOutputFile(badAccnosFName, badAccnosFile);
701                 
702                 ifstream inFASTA;
703                 openInputFile(filename, inFASTA);
704
705                 inFASTA.seekg(line->start);
706         
707                 for(int i=0;i<line->numSeqs;i++){
708                         
709                         if (m->control_pressed) {  return 0; }
710                         
711                         Sequence currSeq(inFASTA);
712                         if (currSeq.getName() != "") {
713                                 bool goodSeq = 1;               //      innocent until proven guilty
714                                 if(goodSeq == 1 && startPos != -1 && startPos < currSeq.getStartPos())                  {       goodSeq = 0;    }
715                                 if(goodSeq == 1 && endPos != -1 && endPos > currSeq.getEndPos())                                {       goodSeq = 0;    }
716                                 if(goodSeq == 1 && maxAmbig != -1 && maxAmbig < currSeq.getAmbigBases())                {       goodSeq = 0;    }
717                                 if(goodSeq == 1 && maxHomoP != -1 && maxHomoP < currSeq.getLongHomoPolymer())   {       goodSeq = 0;    }
718                                 if(goodSeq == 1 && minLength != -1 && minLength > currSeq.getNumBases())                {       goodSeq = 0;    }
719                                 if(goodSeq == 1 && maxLength != -1 && maxLength < currSeq.getNumBases())                {       goodSeq = 0;    }
720                                 
721                                 if(goodSeq == 1){
722                                         currSeq.printSequence(goodFile);        
723                                 }
724                                 else{
725                                         currSeq.printSequence(badFile); 
726                                         badAccnosFile << currSeq.getName() << endl;
727                                         badSeqNames.insert(currSeq.getName());
728                                 }
729                         }
730                         gobble(inFASTA);
731                 }
732                 
733                         
734                 goodFile.close();
735                 inFASTA.close();
736                 badFile.close();
737                 badAccnosFile.close();
738                 
739                 return 1;
740         }
741         catch(exception& e) {
742                 m->errorOut(e, "ScreenSeqsCommand", "driver");
743                 exit(1);
744         }
745 }
746 //**********************************************************************************************************************
747 #ifdef USE_MPI
748 int ScreenSeqsCommand::driverMPI(int start, int num, MPI_File& inMPI, MPI_File& goodFile, MPI_File& badFile, MPI_File& badAccnosFile, vector<long>& MPIPos, set<string>& badSeqNames){
749         try {
750                 string outputString = "";
751                 MPI_Status statusGood; 
752                 MPI_Status statusBad; 
753                 MPI_Status statusBadAccnos; 
754                 MPI_Status status; 
755                 int pid;
756                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
757
758                 for(int i=0;i<num;i++){
759                 
760                         if (m->control_pressed) {  return 0; }
761                         
762                         //read next sequence
763                         int length = MPIPos[start+i+1] - MPIPos[start+i];
764
765                         char* buf4 = new char[length];
766                         memcpy(buf4, outputString.c_str(), length);
767
768                         MPI_File_read_at(inMPI, MPIPos[start+i], buf4, length, MPI_CHAR, &status);
769                         
770                         string tempBuf = buf4;  delete buf4;
771                         if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length);  }
772                         istringstream iss (tempBuf,istringstream::in);
773                         
774                         Sequence currSeq(iss);                  
775                         
776                         //process seq
777                         if (currSeq.getName() != "") {
778                                 bool goodSeq = 1;               //      innocent until proven guilty
779                                 if(goodSeq == 1 && startPos != -1 && startPos < currSeq.getStartPos())                  {       goodSeq = 0;    }
780                                 if(goodSeq == 1 && endPos != -1 && endPos > currSeq.getEndPos())                                {       goodSeq = 0;    }
781                                 if(goodSeq == 1 && maxAmbig != -1 && maxAmbig < currSeq.getAmbigBases())                {       goodSeq = 0;    }
782                                 if(goodSeq == 1 && maxHomoP != -1 && maxHomoP < currSeq.getLongHomoPolymer())   {       goodSeq = 0;    }
783                                 if(goodSeq == 1 && minLength != -1 && minLength > currSeq.getNumBases())                {       goodSeq = 0;    }
784                                 if(goodSeq == 1 && maxLength != -1 && maxLength < currSeq.getNumBases())                {       goodSeq = 0;    }
785                                 
786                                 if(goodSeq == 1){
787                                         outputString =  ">" + currSeq.getName() + "\n" + currSeq.getAligned() + "\n";
788                                 
789                                         //print good seq
790                                         length = outputString.length();
791                                         char* buf2 = new char[length];
792                                         memcpy(buf2, outputString.c_str(), length);
793                                         
794                                         MPI_File_write_shared(goodFile, buf2, length, MPI_CHAR, &statusGood);
795                                         delete buf2;
796                                 }
797                                 else{
798                                         outputString =  ">" + currSeq.getName() + "\n" + currSeq.getAligned() + "\n";
799                                 
800                                         //print bad seq to fasta
801                                         length = outputString.length();
802                                         char* buf2 = new char[length];
803                                         memcpy(buf2, outputString.c_str(), length);
804                                         
805                                         MPI_File_write_shared(badFile, buf2, length, MPI_CHAR, &statusBad);
806                                         delete buf2;
807
808                                         badSeqNames.insert(currSeq.getName());
809                                         
810                                         //write to bad accnos file
811                                         outputString = currSeq.getName() + "\n";
812                                 
813                                         length = outputString.length();
814                                         char* buf3 = new char[length];
815                                         memcpy(buf3, outputString.c_str(), length);
816                                         
817                                         MPI_File_write_shared(badAccnosFile, buf3, length, MPI_CHAR, &statusBadAccnos);
818                                         delete buf3;
819                                 }
820                         }
821                 }
822                                 
823                 return 1;
824         }
825         catch(exception& e) {
826                 m->errorOut(e, "ScreenSeqsCommand", "driverMPI");
827                 exit(1);
828         }
829 }
830 #endif
831 /**************************************************************************************************/
832
833 int ScreenSeqsCommand::createProcesses(string goodFileName, string badFileName, string badAccnos, string filename, set<string>& badSeqNames) {
834         try {
835 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
836                 int process = 0;
837                 int exitCommand = 1;
838                 
839                 //loop through and create all the processes you want
840                 while (process != processors) {
841                         int pid = fork();
842                         
843                         if (pid > 0) {
844                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
845                                 process++;
846                         }else if (pid == 0){
847                                 exitCommand = driver(lines[process], goodFileName + toString(getpid()) + ".temp", badFileName + toString(getpid()) + ".temp", badAccnos + toString(getpid()) + ".temp", filename, badSeqNames);
848                                 exit(0);
849                         }else { m->mothurOut("unable to spawn the necessary processes."); m->mothurOutEndLine(); exit(0); }
850                 }
851                 
852                 //force parent to wait until all the processes are done
853                 for (int i=0;i<processors;i++) { 
854                         int temp = processIDS[i];
855                         wait(&temp);
856                 }
857                 
858                 return exitCommand;
859 #endif          
860         }
861         catch(exception& e) {
862                 m->errorOut(e, "ScreenSeqsCommand", "createProcesses");
863                 exit(1);
864         }
865 }
866
867 //***************************************************************************************************************
868
869