]> git.donarmstrong.com Git - mothur.git/blob - screenseqscommand.cpp
changed how we count sequences in a fastafile to allow for '>' in sequence names
[mothur.git] / screenseqscommand.cpp
1 /*
2  *  screenseqscommand.cpp
3  *  Mothur
4  *
5  *  Created by Pat Schloss on 6/3/09.
6  *  Copyright 2009 Patrick D. Schloss. All rights reserved.
7  *
8  */
9
10 #include "screenseqscommand.h"
11 #include "sequence.hpp"
12
13 //***************************************************************************************************************
14
15 ScreenSeqsCommand::ScreenSeqsCommand(string option)  {
16         try {
17                 abort = false;
18                 
19                 //allow user to run help
20                 if(option == "help") { help(); abort = true; }
21                 
22                 else {
23                         //valid paramters for this command
24                         string AlignArray[] =  {"fasta", "start", "end", "maxambig", "maxhomop", "minlength", "maxlength",
25                                                                         "name", "group", "alignreport","processors","outputdir","inputdir"};
26                         vector<string> myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string)));
27                         
28                         OptionParser parser(option);
29                         map<string,string> parameters = parser.getParameters();
30                         
31                         ValidParameters validParameter;
32                         map<string,string>::iterator it;
33                         
34                         //check to make sure all parameters are valid for command
35                         for (it = parameters.begin(); it != parameters.end(); it++) { 
36                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
37                         }
38                         
39                         //if the user changes the input directory command factory will send this info to us in the output parameter 
40                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
41                         if (inputDir == "not found"){   inputDir = "";          }
42                         else {
43                                 string path;
44                                 it = parameters.find("fasta");
45                                 //user has given a template file
46                                 if(it != parameters.end()){ 
47                                         path = hasPath(it->second);
48                                         //if the user has not given a path then, add inputdir. else leave path alone.
49                                         if (path == "") {       parameters["fasta"] = inputDir + it->second;            }
50                                 }
51                                 
52                                 it = parameters.find("group");
53                                 //user has given a template file
54                                 if(it != parameters.end()){ 
55                                         path = hasPath(it->second);
56                                         //if the user has not given a path then, add inputdir. else leave path alone.
57                                         if (path == "") {       parameters["group"] = inputDir + it->second;            }
58                                 }
59                                 
60                                 it = parameters.find("name");
61                                 //user has given a template file
62                                 if(it != parameters.end()){ 
63                                         path = hasPath(it->second);
64                                         //if the user has not given a path then, add inputdir. else leave path alone.
65                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
66                                 }
67                                 
68                                 it = parameters.find("alignreport");
69                                 //user has given a template file
70                                 if(it != parameters.end()){ 
71                                         path = hasPath(it->second);
72                                         //if the user has not given a path then, add inputdir. else leave path alone.
73                                         if (path == "") {       parameters["alignreport"] = inputDir + it->second;              }
74                                 }
75                         }
76
77                         //check for required parameters
78                         fastafile = validParameter.validFile(parameters, "fasta", true);
79                         if (fastafile == "not found") { m->mothurOut("fasta is a required parameter for the screen.seqs command."); m->mothurOutEndLine(); abort = true; }
80                         else if (fastafile == "not open") { abort = true; }     
81         
82                         groupfile = validParameter.validFile(parameters, "group", true);
83                         if (groupfile == "not open") { abort = true; }  
84                         else if (groupfile == "not found") { groupfile = ""; }
85                         
86                         namefile = validParameter.validFile(parameters, "name", true);
87                         if (namefile == "not open") { abort = true; }
88                         else if (namefile == "not found") { namefile = ""; }    
89
90                         alignreport = validParameter.validFile(parameters, "alignreport", true);
91                         if (alignreport == "not open") { abort = true; }
92                         else if (alignreport == "not found") { alignreport = ""; }      
93                         
94                         //if the user changes the output directory command factory will send this info to us in the output parameter 
95                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  
96                                 outputDir = ""; 
97                                 outputDir += hasPath(fastafile); //if user entered a file with a path then preserve it  
98                         }
99
100                         //check for optional parameter and set defaults
101                         // ...at some point should added some additional type checking...
102                         string temp;
103                         temp = validParameter.validFile(parameters, "start", false);            if (temp == "not found") { temp = "-1"; }
104                         convert(temp, startPos); 
105                 
106                         temp = validParameter.validFile(parameters, "end", false);                      if (temp == "not found") { temp = "-1"; }
107                         convert(temp, endPos);  
108
109                         temp = validParameter.validFile(parameters, "maxambig", false);         if (temp == "not found") { temp = "-1"; }
110                         convert(temp, maxAmbig);  
111
112                         temp = validParameter.validFile(parameters, "maxhomop", false);         if (temp == "not found") { temp = "-1"; }
113                         convert(temp, maxHomoP);  
114
115                         temp = validParameter.validFile(parameters, "minlength", false);        if (temp == "not found") { temp = "-1"; }
116                         convert(temp, minLength); 
117                         
118                         temp = validParameter.validFile(parameters, "maxlength", false);        if (temp == "not found") { temp = "-1"; }
119                         convert(temp, maxLength); 
120                         
121                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found"){       temp = "1";                             }
122                         convert(temp, processors); 
123
124                 }
125
126         }
127         catch(exception& e) {
128                 m->errorOut(e, "ScreenSeqsCommand", "ScreenSeqsCommand");
129                 exit(1);
130         }
131 }
132 //**********************************************************************************************************************
133
134 void ScreenSeqsCommand::help(){
135         try {
136                 m->mothurOut("The screen.seqs command reads a fastafile and creates .....\n");
137                 m->mothurOut("The screen.seqs command parameters are fasta, start, end, maxambig, maxhomop, minlength, maxlength, name, group and processors.\n");
138                 m->mothurOut("The fasta parameter is required.\n");
139                 m->mothurOut("The start parameter .... The default is -1.\n");
140                 m->mothurOut("The end parameter .... The default is -1.\n");
141                 m->mothurOut("The maxambig parameter .... The default is -1.\n");
142                 m->mothurOut("The maxhomop parameter .... The default is -1.\n");
143                 m->mothurOut("The minlength parameter .... The default is -1.\n");
144                 m->mothurOut("The maxlength parameter .... The default is -1.\n");
145                 m->mothurOut("The processors parameter allows you to specify the number of processors to use while running the command. The default is 1.\n");
146                 m->mothurOut("The name parameter allows you to provide a namesfile, and the group parameter allows you to provide a groupfile.\n");
147                 m->mothurOut("The screen.seqs command should be in the following format: \n");
148                 m->mothurOut("screen.seqs(fasta=yourFastaFile, name=youNameFile, group=yourGroupFIle, start=yourStart, end=yourEnd, maxambig=yourMaxambig,  \n");
149                 m->mothurOut("maxhomop=yourMaxhomop, minlength=youMinlength, maxlength=yourMaxlength)  \n");    
150                 m->mothurOut("Example screen.seqs(fasta=abrecovery.fasta, name=abrecovery.names, group=abrecovery.groups, start=..., end=..., maxambig=..., maxhomop=..., minlength=..., maxlength=...).\n");
151                 m->mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n\n");
152
153         }
154         catch(exception& e) {
155                 m->errorOut(e, "ScreenSeqsCommand", "help");
156                 exit(1);
157         }
158 }
159
160 //***************************************************************************************************************
161
162 ScreenSeqsCommand::~ScreenSeqsCommand(){        /*      do nothing      */      }
163
164 //***************************************************************************************************************
165
166 int ScreenSeqsCommand::execute(){
167         try{
168                 
169                 if (abort == true) { return 0; }
170                                 
171                 string goodSeqFile = outputDir + getRootName(getSimpleName(fastafile)) + "good" + getExtension(fastafile);
172                 string badSeqFile =  outputDir + getRootName(getSimpleName(fastafile)) + "bad" + getExtension(fastafile);
173                 string badAccnosFile =  outputDir + getRootName(getSimpleName(fastafile)) + "bad.accnos";
174                 
175                 int numFastaSeqs = 0;
176                 set<string> badSeqNames;
177                 int start = time(NULL);
178                 
179 #ifdef USE_MPI  
180                         int pid, end, numSeqsPerProcessor; 
181                         int tag = 2001;
182                         vector<long> MPIPos;
183                         
184                         MPI_Status status; 
185                         MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
186                         MPI_Comm_size(MPI_COMM_WORLD, &processors); 
187
188                         MPI_File inMPI;
189                         MPI_File outMPIGood;
190                         MPI_File outMPIBad;
191                         MPI_File outMPIBadAccnos;
192                         
193                         int outMode=MPI_MODE_CREATE|MPI_MODE_WRONLY; 
194                         int inMode=MPI_MODE_RDONLY; 
195                         
196                         char outGoodFilename[1024];
197                         strcpy(outGoodFilename, goodSeqFile.c_str());
198
199                         char outBadFilename[1024];
200                         strcpy(outBadFilename, badSeqFile.c_str());
201                         
202                         char outBadAccnosFilename[1024];
203                         strcpy(outBadAccnosFilename, badAccnosFile.c_str());
204
205                         char inFileName[1024];
206                         strcpy(inFileName, fastafile.c_str());
207                         
208                         MPI_File_open(MPI_COMM_WORLD, inFileName, inMode, MPI_INFO_NULL, &inMPI);  //comm, filename, mode, info, filepointer
209                         MPI_File_open(MPI_COMM_WORLD, outGoodFilename, outMode, MPI_INFO_NULL, &outMPIGood);
210                         MPI_File_open(MPI_COMM_WORLD, outBadFilename, outMode, MPI_INFO_NULL, &outMPIBad);
211                         MPI_File_open(MPI_COMM_WORLD, outBadAccnosFilename, outMode, MPI_INFO_NULL, &outMPIBadAccnos);
212                         
213                         if (m->control_pressed) { MPI_File_close(&inMPI);  MPI_File_close(&outMPIGood);  MPI_File_close(&outMPIBad); MPI_File_close(&outMPIBadAccnos); return 0; }
214                         
215                         if (pid == 0) { //you are the root process 
216                                 
217                                 MPIPos = setFilePosFasta(fastafile, numFastaSeqs); //fills MPIPos, returns numSeqs
218                                 
219                                 //send file positions to all processes
220                                 for(int i = 1; i < processors; i++) { 
221                                         MPI_Send(&numFastaSeqs, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
222                                         MPI_Send(&MPIPos[0], (numFastaSeqs+1), MPI_LONG, i, tag, MPI_COMM_WORLD);
223                                 }
224                                 
225                                 //figure out how many sequences you have to align
226                                 numSeqsPerProcessor = numFastaSeqs / processors;
227                                 int startIndex =  pid * numSeqsPerProcessor;
228                                 if(pid == (processors - 1)){    numSeqsPerProcessor = numFastaSeqs - pid * numSeqsPerProcessor;         }
229                                 
230                                 //align your part
231                                 driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPIGood, outMPIBad, outMPIBadAccnos, MPIPos, badSeqNames);
232
233                                 if (m->control_pressed) { MPI_File_close(&inMPI);  MPI_File_close(&outMPIGood);  MPI_File_close(&outMPIBadAccnos); MPI_File_close(&outMPIBad);  return 0; }
234
235                                 for (int i = 1; i < processors; i++) {
236                                 
237                                         //get bad lists
238                                         int badSize;
239                                         MPI_Recv(&badSize, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
240                                         /*for (int j = 0; j < badSize; j++) {
241                                                 int length;
242                                                 MPI_Recv(&length, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);  //recv the length of the name
243                                                 char* buf2 = new char[length];                                                                          //make space to recieve it
244                                                 MPI_Recv(buf2, length, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status);  //get name
245                                                 
246                                                 string tempBuf = buf2;
247                                                 if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length); }
248                                                 delete buf2;
249                                                 
250                                                 badSeqNames.insert(tempBuf);
251                                         }*/
252                                 }
253                         }else{ //you are a child process
254                                 MPI_Recv(&numFastaSeqs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
255                                 MPIPos.resize(numFastaSeqs+1);
256                                 MPI_Recv(&MPIPos[0], (numFastaSeqs+1), MPI_LONG, 0, tag, MPI_COMM_WORLD, &status);
257
258                                 //figure out how many sequences you have to align
259                                 numSeqsPerProcessor = numFastaSeqs / processors;
260                                 int startIndex =  pid * numSeqsPerProcessor;
261                                 if(pid == (processors - 1)){    numSeqsPerProcessor = numFastaSeqs - pid * numSeqsPerProcessor;         }
262                                 
263                                 //align your part
264                                 driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPIGood, outMPIBad, outMPIBadAccnos, MPIPos, badSeqNames);
265
266                                 if (m->control_pressed) { MPI_File_close(&inMPI);  MPI_File_close(&outMPIGood);  MPI_File_close(&outMPIBad); MPI_File_close(&outMPIBadAccnos); return 0; }
267                                 
268                                 //send bad list 
269                                 int badSize = badSeqNames.size();
270                                 MPI_Send(&badSize, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
271                                 
272                                 /*
273                                 set<string>::iterator it;
274                                 for (it = badSeqNames.begin(); it != badSeqNames.end(); it++) {
275                                         string name = *it;
276                                         int length = name.length();
277                                         char* buf2 = new char[length];
278                                         memcpy(buf2, name.c_str(), length);
279                                         
280                                         MPI_Send(&length, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
281                                         MPI_Send(buf2, length, MPI_CHAR, 0, tag, MPI_COMM_WORLD);
282                                 }*/
283                         }
284                         
285                         //close files 
286                         MPI_File_close(&inMPI);
287                         MPI_File_close(&outMPIGood);
288                         MPI_File_close(&outMPIBad);
289                         MPI_File_close(&outMPIBadAccnos);
290                         MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case
291                                         
292 #else
293                                         
294         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
295                         if(processors == 1){
296                                 ifstream inFASTA;
297                                 openInputFile(fastafile, inFASTA);
298                                 getNumSeqs(inFASTA, numFastaSeqs);
299                                 inFASTA.close();
300                                 
301                                 lines.push_back(new linePair(0, numFastaSeqs));
302                                 
303                                 driver(lines[0], goodSeqFile, badSeqFile, badAccnosFile, fastafile, badSeqNames);
304                                 
305                                 if (m->control_pressed) { remove(goodSeqFile.c_str()); remove(badSeqFile.c_str()); return 0; }
306                                 
307                         }else{
308                                 vector<int> positions;
309                                 processIDS.resize(0);
310                                 
311                                 ifstream inFASTA;
312                                 openInputFile(fastafile, inFASTA);
313                                 
314                                 string input;
315                                 while(!inFASTA.eof()){
316                                         input = getline(inFASTA);
317                                         if (input.length() != 0) {
318                                                 if(input[0] == '>'){    long int pos = inFASTA.tellg(); positions.push_back(pos - input.length() - 1);  }
319                                         }
320                                 }
321                                 inFASTA.close();
322                                 
323                                 numFastaSeqs = positions.size();
324                                 
325                                 int numSeqsPerProcessor = numFastaSeqs / processors;
326                                 
327                                 for (int i = 0; i < processors; i++) {
328                                         long int startPos = positions[ i * numSeqsPerProcessor ];
329                                         if(i == processors - 1){
330                                                 numSeqsPerProcessor = numFastaSeqs - i * numSeqsPerProcessor;
331                                         }
332                                         lines.push_back(new linePair(startPos, numSeqsPerProcessor));
333                                 }
334                                 
335                                 createProcesses(goodSeqFile, badSeqFile, badAccnosFile, fastafile, badSeqNames); 
336                                 
337                                 rename((goodSeqFile + toString(processIDS[0]) + ".temp").c_str(), goodSeqFile.c_str());
338                                 rename((badSeqFile + toString(processIDS[0]) + ".temp").c_str(), badSeqFile.c_str());
339                                 rename((badAccnosFile + toString(processIDS[0]) + ".temp").c_str(), badAccnosFile.c_str());
340                                 
341                                 //append alignment and report files
342                                 for(int i=1;i<processors;i++){
343                                         appendFiles((goodSeqFile + toString(processIDS[i]) + ".temp"), goodSeqFile);
344                                         remove((goodSeqFile + toString(processIDS[i]) + ".temp").c_str());
345                                         
346                                         appendFiles((badSeqFile + toString(processIDS[i]) + ".temp"), badSeqFile);
347                                         remove((badSeqFile + toString(processIDS[i]) + ".temp").c_str());
348                                         
349                                         appendFiles((badAccnosFile + toString(processIDS[i]) + ".temp"), badAccnosFile);
350                                         remove((badAccnosFile + toString(processIDS[i]) + ".temp").c_str());
351                                 }
352                                 
353                                 if (m->control_pressed) { remove(goodSeqFile.c_str()); remove(badSeqFile.c_str()); return 0; }
354                                 
355                                 //read badSeqs in because root process doesnt know what other "bad" seqs the children found
356                                 ifstream inBad;
357                                 int ableToOpen = openInputFile(badAccnosFile, inBad, "no error");
358                                 
359                                 if (ableToOpen == 0) {
360                                         badSeqNames.clear();
361                                         string tempName;
362                                         while (!inBad.eof()) {
363                                                 inBad >> tempName; gobble(inBad);
364                                                 badSeqNames.insert(tempName);
365                                         }
366                                         inBad.close();
367                                 }
368                         }
369         #else
370                         ifstream inFASTA;
371                         openInputFile(fastafile, inFASTA);
372                         getNumSeqs(inFASTA, numFastaSeqs);
373                         inFASTA.close();
374                         
375                         lines.push_back(new linePair(0, numFastaSeqs));
376                         
377                         driver(lines[0], goodSeqFile, badSeqFile, badAccnosFile, fastafile, badSeqNames);
378                         
379                         if (m->control_pressed) { remove(goodSeqFile.c_str()); remove(badSeqFile.c_str()); return 0; }
380                         
381         #endif
382
383 #endif          
384
385                 #ifdef USE_MPI
386                         MPI_Comm_rank(MPI_COMM_WORLD, &pid); 
387                                         
388                         if (pid == 0) { //only one process should fix files
389                         
390                                 //read accnos file with all names in it, process 0 just has its names
391                                 MPI_File inMPIAccnos;
392                                 MPI_Offset size;
393                         
394                                 char inFileName[1024];
395                                 strcpy(inFileName, badAccnosFile.c_str());
396                         
397                                 MPI_File_open(MPI_COMM_SELF, inFileName, inMode, MPI_INFO_NULL, &inMPIAccnos);  //comm, filename, mode, info, filepointer
398                                 MPI_File_get_size(inMPIAccnos, &size);
399                         
400                                 char* buffer = new char[size];
401                                 MPI_File_read(inMPIAccnos, buffer, size, MPI_CHAR, &status);
402                         
403                                 string tempBuf = buffer;
404                                 if (tempBuf.length() > size) { tempBuf = tempBuf.substr(0, size);  }
405                                 istringstream iss (tempBuf,istringstream::in);
406
407                                 delete buffer;
408                                 MPI_File_close(&inMPIAccnos);
409                                 
410                                 badSeqNames.clear();
411                                 string tempName;
412                                 while (!iss.eof()) {
413                                         iss >> tempName; gobble(iss);
414                                         badSeqNames.insert(tempName);
415                                 }
416                 #endif
417                                                                                                                                                                         
418                 if(namefile != "" && groupfile != "")   {       
419                         screenNameGroupFile(badSeqNames);       
420                         if (m->control_pressed) {  remove(goodSeqFile.c_str()); remove(badSeqFile.c_str()); return 0; }
421                 }else if(namefile != "")        {       
422                         screenNameGroupFile(badSeqNames);
423                         if (m->control_pressed) {  remove(goodSeqFile.c_str()); remove(badSeqFile.c_str()); return 0; } 
424                 }else if(groupfile != "")                               {       screenGroupFile(badSeqNames);           }       // this screens just the group
425                 
426                 if (m->control_pressed) { remove(goodSeqFile.c_str()); remove(badSeqFile.c_str()); return 0; }
427
428                 if(alignreport != "")                                   {       screenAlignReport(badSeqNames);         }
429                 
430                 if (m->control_pressed) { remove(goodSeqFile.c_str()); remove(badSeqFile.c_str()); return 0; }
431                 
432                 #ifdef USE_MPI
433                         }
434                 #endif
435
436                 m->mothurOutEndLine();
437                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
438                 m->mothurOut(goodSeqFile); m->mothurOutEndLine();       
439                 m->mothurOut(badSeqFile); m->mothurOutEndLine();        
440                 m->mothurOut(badAccnosFile); m->mothurOutEndLine();     
441                 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
442                 m->mothurOutEndLine();
443                 m->mothurOutEndLine();
444
445                 m->mothurOut("It took " + toString(time(NULL) - start) + " secs to screen " + toString(numFastaSeqs) + " sequences.");
446                 m->mothurOutEndLine();
447
448                 return 0;
449         }
450         catch(exception& e) {
451                 m->errorOut(e, "ScreenSeqsCommand", "execute");
452                 exit(1);
453         }
454 }
455
456 //***************************************************************************************************************
457
458 int ScreenSeqsCommand::screenNameGroupFile(set<string> badSeqNames){
459         try {
460                 ifstream inputNames;
461                 openInputFile(namefile, inputNames);
462                 set<string> badSeqGroups;
463                 string seqName, seqList, group;
464                 set<string>::iterator it;
465
466                 string goodNameFile = outputDir + getRootName(getSimpleName(namefile)) + "good" + getExtension(namefile);
467                 string badNameFile = outputDir + getRootName(getSimpleName(namefile)) + "bad" + getExtension(namefile);
468                 
469                 outputNames.push_back(goodNameFile);  outputNames.push_back(badNameFile);
470                 
471                 ofstream goodNameOut;   openOutputFile(goodNameFile, goodNameOut);
472                 ofstream badNameOut;    openOutputFile(badNameFile, badNameOut);                
473                 
474                 while(!inputNames.eof()){
475                         if (m->control_pressed) { goodNameOut.close(); badNameOut.close(); inputNames.close(); remove(goodNameFile.c_str()); remove(badNameFile.c_str()); return 0; }
476
477                         inputNames >> seqName >> seqList;
478                         it = badSeqNames.find(seqName);
479                         
480                         if(it != badSeqNames.end()){
481                                 badSeqNames.erase(it);
482                                 badNameOut << seqName << '\t' << seqList << endl;
483                                 if(namefile != ""){
484                                         int start = 0;
485                                         for(int i=0;i<seqList.length();i++){
486                                                 if(seqList[i] == ','){
487                                                         badSeqGroups.insert(seqList.substr(start,i-start));
488                                                         start = i+1;
489                                                 }                                       
490                                         }
491                                         badSeqGroups.insert(seqList.substr(start,seqList.length()-start));
492                                 }
493                         }
494                         else{
495                                 goodNameOut << seqName << '\t' << seqList << endl;
496                         }
497                         gobble(inputNames);
498                 }
499                 inputNames.close();
500                 goodNameOut.close();
501                 badNameOut.close();
502                 
503                 //we were unable to remove some of the bad sequences
504                 if (badSeqNames.size() != 0) {
505                         for (it = badSeqNames.begin(); it != badSeqNames.end(); it++) {  
506                                 m->mothurOut("Your namefile does not include the sequence " + *it + " please correct."); 
507                                 m->mothurOutEndLine();
508                         }
509                 }
510
511                 if(groupfile != ""){
512                         
513                         ifstream inputGroups;
514                         openInputFile(groupfile, inputGroups);
515
516                         string goodGroupFile = outputDir + getRootName(getSimpleName(groupfile)) + "good" + getExtension(groupfile);
517                         string badGroupFile = outputDir + getRootName(getSimpleName(groupfile)) + "bad" + getExtension(groupfile);
518                         
519                         outputNames.push_back(goodGroupFile);  outputNames.push_back(badGroupFile);
520                         
521                         ofstream goodGroupOut;  openOutputFile(goodGroupFile, goodGroupOut);
522                         ofstream badGroupOut;   openOutputFile(badGroupFile, badGroupOut);              
523                         
524                         while(!inputGroups.eof()){
525                                 if (m->control_pressed) { goodGroupOut.close(); badGroupOut.close(); inputGroups.close(); remove(goodNameFile.c_str()); remove(badNameFile.c_str()); remove(goodGroupFile.c_str()); remove(badGroupFile.c_str()); return 0; }
526
527                                 inputGroups >> seqName >> group;
528
529                                 it = badSeqGroups.find(seqName);
530                                 
531                                 if(it != badSeqGroups.end()){
532                                         badSeqGroups.erase(it);
533                                         badGroupOut << seqName << '\t' << group << endl;
534                                 }
535                                 else{
536                                         goodGroupOut << seqName << '\t' << group << endl;
537                                 }
538                                 gobble(inputGroups);
539                         }
540                         inputGroups.close();
541                         goodGroupOut.close();
542                         badGroupOut.close();
543                         
544                         //we were unable to remove some of the bad sequences
545                         if (badSeqGroups.size() != 0) {
546                                 for (it = badSeqGroups.begin(); it != badSeqGroups.end(); it++) {  
547                                         m->mothurOut("Your namefile does not include the sequence " + *it + " please correct."); 
548                                         m->mothurOutEndLine();
549                                 }
550                         }
551                 }
552                         
553                 return 0;
554         
555         }
556         catch(exception& e) {
557                 m->errorOut(e, "ScreenSeqsCommand", "screenNameGroupFile");
558                 exit(1);
559         }
560 }
561
562 //***************************************************************************************************************
563
564 int ScreenSeqsCommand::screenGroupFile(set<string> badSeqNames){
565         try {
566                 ifstream inputGroups;
567                 openInputFile(groupfile, inputGroups);
568                 string seqName, group;
569                 set<string>::iterator it;
570                 
571                 string goodGroupFile = outputDir + getRootName(getSimpleName(groupfile)) + "good" + getExtension(groupfile);
572                 string badGroupFile = outputDir + getRootName(getSimpleName(groupfile)) + "bad" + getExtension(groupfile);
573                 
574                 outputNames.push_back(goodGroupFile);  outputNames.push_back(badGroupFile);
575                 
576                 ofstream goodGroupOut;  openOutputFile(goodGroupFile, goodGroupOut);
577                 ofstream badGroupOut;   openOutputFile(badGroupFile, badGroupOut);              
578                 
579                 while(!inputGroups.eof()){
580                         if (m->control_pressed) { goodGroupOut.close(); badGroupOut.close(); inputGroups.close(); remove(goodGroupFile.c_str()); remove(badGroupFile.c_str()); return 0; }
581
582                         inputGroups >> seqName >> group;
583                         it = badSeqNames.find(seqName);
584                         
585                         if(it != badSeqNames.end()){
586                                 badSeqNames.erase(it);
587                                 badGroupOut << seqName << '\t' << group << endl;
588                         }
589                         else{
590                                 goodGroupOut << seqName << '\t' << group << endl;
591                         }
592                         gobble(inputGroups);
593                 }
594                 
595                 if (m->control_pressed) { goodGroupOut.close(); badGroupOut.close(); inputGroups.close(); remove(goodGroupFile.c_str()); remove(badGroupFile.c_str()); return 0; }
596
597                 //we were unable to remove some of the bad sequences
598                 if (badSeqNames.size() != 0) {
599                         for (it = badSeqNames.begin(); it != badSeqNames.end(); it++) {  
600                                 m->mothurOut("Your groupfile does not include the sequence " + *it + " please correct."); 
601                                 m->mothurOutEndLine();
602                         }
603                 }
604                 
605                 inputGroups.close();
606                 goodGroupOut.close();
607                 badGroupOut.close();
608                 
609                 if (m->control_pressed) { remove(goodGroupFile.c_str()); remove(badGroupFile.c_str());  }
610
611                 
612                 return 0;
613         
614         }
615         catch(exception& e) {
616                 m->errorOut(e, "ScreenSeqsCommand", "screenGroupFile");
617                 exit(1);
618         }
619 }
620
621 //***************************************************************************************************************
622
623 int ScreenSeqsCommand::screenAlignReport(set<string> badSeqNames){
624         try {
625                 ifstream inputAlignReport;
626                 openInputFile(alignreport, inputAlignReport);
627                 string seqName, group;
628                 set<string>::iterator it;
629                 
630                 string goodAlignReportFile = outputDir + getRootName(getSimpleName(alignreport)) + "good" + getExtension(alignreport);
631                 string badAlignReportFile = outputDir + getRootName(getSimpleName(alignreport)) + "bad" + getExtension(alignreport);
632                 
633                 outputNames.push_back(goodAlignReportFile);  outputNames.push_back(badAlignReportFile);
634                 
635                 ofstream goodAlignReportOut;    openOutputFile(goodAlignReportFile, goodAlignReportOut);
636                 ofstream badAlignReportOut;             openOutputFile(badAlignReportFile, badAlignReportOut);          
637
638                 while (!inputAlignReport.eof()) {               //      need to copy header
639                         char c = inputAlignReport.get();
640                         goodAlignReportOut << c;
641                         badAlignReportOut << c;
642                         if (c == 10 || c == 13){        break;  }       
643                 }
644
645                 while(!inputAlignReport.eof()){
646                         if (m->control_pressed) { goodAlignReportOut.close(); badAlignReportOut.close(); inputAlignReport.close(); remove(goodAlignReportFile.c_str()); remove(badAlignReportFile.c_str()); return 0; }
647
648                         inputAlignReport >> seqName;
649                         it = badSeqNames.find(seqName);
650                         string line;            
651                         while (!inputAlignReport.eof()) {               //      need to copy header
652                                 char c = inputAlignReport.get();
653                                 line += c;
654                                 if (c == 10 || c == 13){        break;  }       
655                         }
656                         
657                         if(it != badSeqNames.end()){
658                                 badSeqNames.erase(it);
659                                 badAlignReportOut << seqName << '\t' << line;
660                         }
661                         else{
662                                 goodAlignReportOut << seqName << '\t' << line;
663                         }
664                         gobble(inputAlignReport);
665                 }
666                 
667                 if (m->control_pressed) { goodAlignReportOut.close(); badAlignReportOut.close(); inputAlignReport.close(); remove(goodAlignReportFile.c_str()); remove(badAlignReportFile.c_str()); return 0; }
668
669                 //we were unable to remove some of the bad sequences
670                 if (badSeqNames.size() != 0) {
671                         for (it = badSeqNames.begin(); it != badSeqNames.end(); it++) {  
672                                 m->mothurOut("Your file does not include the sequence " + *it + " please correct."); 
673                                 m->mothurOutEndLine();
674                         }
675                 }
676
677                 inputAlignReport.close();
678                 goodAlignReportOut.close();
679                 badAlignReportOut.close();
680                                 
681                 if (m->control_pressed) {  remove(goodAlignReportFile.c_str()); remove(badAlignReportFile.c_str()); return 0; }
682                 
683                 return 0;
684         
685         }
686         catch(exception& e) {
687                 m->errorOut(e, "ScreenSeqsCommand", "screenAlignReport");
688                 exit(1);
689         }
690         
691 }
692 //**********************************************************************************************************************
693
694 int ScreenSeqsCommand::driver(linePair* line, string goodFName, string badFName, string badAccnosFName, string filename, set<string>& badSeqNames){
695         try {
696                 ofstream goodFile;
697                 openOutputFile(goodFName, goodFile);
698                 
699                 ofstream badFile;
700                 openOutputFile(badFName, badFile);
701                 
702                 ofstream badAccnosFile;
703                 openOutputFile(badAccnosFName, badAccnosFile);
704                 
705                 ifstream inFASTA;
706                 openInputFile(filename, inFASTA);
707
708                 inFASTA.seekg(line->start);
709         
710                 for(int i=0;i<line->numSeqs;i++){
711                         
712                         if (m->control_pressed) {  return 0; }
713                         
714                         Sequence currSeq(inFASTA);
715                         if (currSeq.getName() != "") {
716                                 bool goodSeq = 1;               //      innocent until proven guilty
717                                 if(goodSeq == 1 && startPos != -1 && startPos < currSeq.getStartPos())                  {       goodSeq = 0;    }
718                                 if(goodSeq == 1 && endPos != -1 && endPos > currSeq.getEndPos())                                {       goodSeq = 0;    }
719                                 if(goodSeq == 1 && maxAmbig != -1 && maxAmbig < currSeq.getAmbigBases())                {       goodSeq = 0;    }
720                                 if(goodSeq == 1 && maxHomoP != -1 && maxHomoP < currSeq.getLongHomoPolymer())   {       goodSeq = 0;    }
721                                 if(goodSeq == 1 && minLength != -1 && minLength > currSeq.getNumBases())                {       goodSeq = 0;    }
722                                 if(goodSeq == 1 && maxLength != -1 && maxLength < currSeq.getNumBases())                {       goodSeq = 0;    }
723                                 
724                                 if(goodSeq == 1){
725                                         currSeq.printSequence(goodFile);        
726                                 }
727                                 else{
728                                         currSeq.printSequence(badFile); 
729                                         badAccnosFile << currSeq.getName() << endl;
730                                         badSeqNames.insert(currSeq.getName());
731                                 }
732                         }
733                         gobble(inFASTA);
734                 }
735                 
736                         
737                 goodFile.close();
738                 inFASTA.close();
739                 badFile.close();
740                 badAccnosFile.close();
741                 
742                 return 1;
743         }
744         catch(exception& e) {
745                 m->errorOut(e, "ScreenSeqsCommand", "driver");
746                 exit(1);
747         }
748 }
749 //**********************************************************************************************************************
750 #ifdef USE_MPI
751 int ScreenSeqsCommand::driverMPI(int start, int num, MPI_File& inMPI, MPI_File& goodFile, MPI_File& badFile, MPI_File& badAccnosFile, vector<long>& MPIPos, set<string>& badSeqNames){
752         try {
753                 string outputString = "";
754                 MPI_Status statusGood; 
755                 MPI_Status statusBad; 
756                 MPI_Status statusBadAccnos; 
757                 MPI_Status status; 
758                 int pid;
759                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
760
761                 for(int i=0;i<num;i++){
762                 
763                         if (m->control_pressed) {  return 0; }
764                         
765                         //read next sequence
766                         int length = MPIPos[start+i+1] - MPIPos[start+i];
767
768                         char* buf4 = new char[length];
769                         memcpy(buf4, outputString.c_str(), length);
770
771                         MPI_File_read_at(inMPI, MPIPos[start+i], buf4, length, MPI_CHAR, &status);
772                         
773                         string tempBuf = buf4;  delete buf4;
774                         if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length);  }
775                         istringstream iss (tempBuf,istringstream::in);
776                         
777                         Sequence currSeq(iss);                  
778                         
779                         //process seq
780                         if (currSeq.getName() != "") {
781                                 bool goodSeq = 1;               //      innocent until proven guilty
782                                 if(goodSeq == 1 && startPos != -1 && startPos < currSeq.getStartPos())                  {       goodSeq = 0;    }
783                                 if(goodSeq == 1 && endPos != -1 && endPos > currSeq.getEndPos())                                {       goodSeq = 0;    }
784                                 if(goodSeq == 1 && maxAmbig != -1 && maxAmbig < currSeq.getAmbigBases())                {       goodSeq = 0;    }
785                                 if(goodSeq == 1 && maxHomoP != -1 && maxHomoP < currSeq.getLongHomoPolymer())   {       goodSeq = 0;    }
786                                 if(goodSeq == 1 && minLength != -1 && minLength > currSeq.getNumBases())                {       goodSeq = 0;    }
787                                 if(goodSeq == 1 && maxLength != -1 && maxLength < currSeq.getNumBases())                {       goodSeq = 0;    }
788                                 
789                                 if(goodSeq == 1){
790                                         outputString =  ">" + currSeq.getName() + "\n" + currSeq.getAligned() + "\n";
791                                 
792                                         //print good seq
793                                         length = outputString.length();
794                                         char* buf2 = new char[length];
795                                         memcpy(buf2, outputString.c_str(), length);
796                                         
797                                         MPI_File_write_shared(goodFile, buf2, length, MPI_CHAR, &statusGood);
798                                         delete buf2;
799                                 }
800                                 else{
801                                         outputString =  ">" + currSeq.getName() + "\n" + currSeq.getAligned() + "\n";
802                                 
803                                         //print bad seq to fasta
804                                         length = outputString.length();
805                                         char* buf2 = new char[length];
806                                         memcpy(buf2, outputString.c_str(), length);
807                                         
808                                         MPI_File_write_shared(badFile, buf2, length, MPI_CHAR, &statusBad);
809                                         delete buf2;
810
811                                         badSeqNames.insert(currSeq.getName());
812                                         
813                                         //write to bad accnos file
814                                         outputString = currSeq.getName() + "\n";
815                                 
816                                         length = outputString.length();
817                                         char* buf3 = new char[length];
818                                         memcpy(buf3, outputString.c_str(), length);
819                                         
820                                         MPI_File_write_shared(badAccnosFile, buf3, length, MPI_CHAR, &statusBadAccnos);
821                                         delete buf3;
822                                 }
823                         }
824                 }
825                                 
826                 return 1;
827         }
828         catch(exception& e) {
829                 m->errorOut(e, "ScreenSeqsCommand", "driverMPI");
830                 exit(1);
831         }
832 }
833 #endif
834 /**************************************************************************************************/
835
836 int ScreenSeqsCommand::createProcesses(string goodFileName, string badFileName, string badAccnos, string filename, set<string>& badSeqNames) {
837         try {
838 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
839                 int process = 0;
840                 int exitCommand = 1;
841                 
842                 //loop through and create all the processes you want
843                 while (process != processors) {
844                         int pid = fork();
845                         
846                         if (pid > 0) {
847                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
848                                 process++;
849                         }else if (pid == 0){
850                                 exitCommand = driver(lines[process], goodFileName + toString(getpid()) + ".temp", badFileName + toString(getpid()) + ".temp", badAccnos + toString(getpid()) + ".temp", filename, badSeqNames);
851                                 exit(0);
852                         }else { m->mothurOut("unable to spawn the necessary processes."); m->mothurOutEndLine(); exit(0); }
853                 }
854                 
855                 //force parent to wait until all the processes are done
856                 for (int i=0;i<processors;i++) { 
857                         int temp = processIDS[i];
858                         wait(&temp);
859                 }
860                 
861                 return exitCommand;
862 #endif          
863         }
864         catch(exception& e) {
865                 m->errorOut(e, "ScreenSeqsCommand", "createProcesses");
866                 exit(1);
867         }
868 }
869
870 //***************************************************************************************************************
871
872