]> git.donarmstrong.com Git - mothur.git/blob - screenseqscommand.cpp
PDS: update citation fields
[mothur.git] / screenseqscommand.cpp
1 /*
2  *  screenseqscommand.cpp
3  *  Mothur
4  *
5  *  Created by Pat Schloss on 6/3/09.
6  *  Copyright 2009 Patrick D. Schloss. All rights reserved.
7  *
8  */
9
10 #include "screenseqscommand.h"
11
12
13 //**********************************************************************************************************************
14 vector<string> ScreenSeqsCommand::setParameters(){      
15         try {
16                 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta);
17                 CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname);
18                 CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pgroup);
19                 CommandParameter pqfile("qfile", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pqfile);
20                 CommandParameter palignreport("alignreport", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(palignreport);
21                 CommandParameter ptax("taxonomy", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(ptax);
22                 CommandParameter pstart("start", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pstart);
23                 CommandParameter pend("end", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pend);
24                 CommandParameter pmaxambig("maxambig", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pmaxambig);
25                 CommandParameter pmaxhomop("maxhomop", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pmaxhomop);
26                 CommandParameter pminlength("minlength", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pminlength);
27                 CommandParameter pmaxlength("maxlength", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pmaxlength);
28                 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
29                 CommandParameter pcriteria("criteria", "Number", "", "90", "", "", "",false,false); parameters.push_back(pcriteria);
30                 CommandParameter poptimize("optimize", "Multiple", "none-start-end-maxambig-maxhomop-minlength-maxlength", "none", "", "", "",true,false); parameters.push_back(poptimize);
31                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
32                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
33                 
34                 vector<string> myArray;
35                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
36                 return myArray;
37         }
38         catch(exception& e) {
39                 m->errorOut(e, "ScreenSeqsCommand", "setParameters");
40                 exit(1);
41         }
42 }
43 //**********************************************************************************************************************
44 string ScreenSeqsCommand::getHelpString(){      
45         try {
46                 string helpString = "";
47                 helpString += "The screen.seqs command reads a fastafile and creates .....\n";
48                 helpString += "The screen.seqs command parameters are fasta, start, end, maxambig, maxhomop, minlength, maxlength, name, group, qfile, alignreport, taxonomy, optimize, criteria and processors.\n";
49                 helpString += "The fasta parameter is required.\n";
50                 helpString += "The alignreport and taxonomy parameters allow you to remove bad seqs from taxonomy and alignreport files.\n";
51                 helpString += "The start parameter is used to set a position the \"good\" sequences must start by. The default is -1.\n";
52                 helpString += "The end parameter is used to set a position the \"good\" sequences must end after. The default is -1.\n";
53                 helpString += "The maxambig parameter allows you to set the maximum number of ambigious bases allowed. The default is -1.\n";
54                 helpString += "The maxhomop parameter allows you to set a maximum homopolymer length. \n";
55                 helpString += "The minlength parameter allows you to set and minimum sequence length. \n";
56                 helpString += "The maxlength parameter allows you to set and maximum sequence length. \n";
57                 helpString += "The processors parameter allows you to specify the number of processors to use while running the command. The default is 1.\n";
58                 helpString += "The optimize and criteria parameters allow you set the start, end, maxabig, maxhomop, minlength and maxlength parameters relative to your set of sequences .\n";
59                 helpString += "For example optimize=start-end, criteria=90, would set the start and end values to the position 90% of your sequences started and ended.\n";
60                 helpString += "The name parameter allows you to provide a namesfile, and the group parameter allows you to provide a groupfile.\n";
61                 helpString += "The screen.seqs command should be in the following format: \n";
62                 helpString += "screen.seqs(fasta=yourFastaFile, name=youNameFile, group=yourGroupFIle, start=yourStart, end=yourEnd, maxambig=yourMaxambig,  \n";
63                 helpString += "maxhomop=yourMaxhomop, minlength=youMinlength, maxlength=yourMaxlength)  \n";    
64                 helpString += "Example screen.seqs(fasta=abrecovery.fasta, name=abrecovery.names, group=abrecovery.groups, start=..., end=..., maxambig=..., maxhomop=..., minlength=..., maxlength=...).\n";
65                 helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n";
66                 return helpString;
67         }
68         catch(exception& e) {
69                 m->errorOut(e, "ScreenSeqsCommand", "getHelpString");
70                 exit(1);
71         }
72 }
73 //**********************************************************************************************************************
74 string ScreenSeqsCommand::getOutputFileNameTag(string type, string inputName=""){       
75         try {
76         string outputFileName = "";
77                 map<string, vector<string> >::iterator it;
78         
79         //is this a type this command creates
80         it = outputTypes.find(type);
81         if (it == outputTypes.end()) {  m->mothurOut("[ERROR]: this command doesn't create a " + type + " output file.\n"); }
82         else {
83             if (type == "fasta")            {   outputFileName =  "good" + m->getExtension(inputName);   }
84             else if (type == "taxonomy")    {   outputFileName =  "good" + m->getExtension(inputName);   }
85             else if (type == "name")        {   outputFileName =  "good" + m->getExtension(inputName);   }
86             else if (type == "group")       {   outputFileName =  "good" + m->getExtension(inputName);   }
87             else if (type == "accnos")      {   outputFileName =  "bad.accnos";   }
88             else if (type == "qfile")       {   outputFileName =  "good" + m->getExtension(inputName);   }
89             else if (type == "alignreport") {   outputFileName =  "good.align.report";                   }
90             else { m->mothurOut("[ERROR]: No definition for type " + type + " output file tag.\n"); m->control_pressed = true;  }
91         }
92         return outputFileName;
93         }
94         catch(exception& e) {
95                 m->errorOut(e, "ScreenSeqsCommand", "getOutputFileNameTag");
96                 exit(1);
97         }
98 }
99
100 //**********************************************************************************************************************
101 ScreenSeqsCommand::ScreenSeqsCommand(){ 
102         try {
103                 abort = true; calledHelp = true; 
104                 setParameters();
105                 vector<string> tempOutNames;
106                 outputTypes["fasta"] = tempOutNames;
107                 outputTypes["name"] = tempOutNames;
108                 outputTypes["group"] = tempOutNames;
109                 outputTypes["alignreport"] = tempOutNames;
110                 outputTypes["accnos"] = tempOutNames;
111                 outputTypes["qfile"] = tempOutNames;
112                 outputTypes["taxonomy"] = tempOutNames;
113         }
114         catch(exception& e) {
115                 m->errorOut(e, "ScreenSeqsCommand", "ScreenSeqsCommand");
116                 exit(1);
117         }
118 }
119 //***************************************************************************************************************
120
121 ScreenSeqsCommand::ScreenSeqsCommand(string option)  {
122         try {
123                 abort = false; calledHelp = false;   
124                 
125                 //allow user to run help
126                 if(option == "help") { help(); abort = true; calledHelp = true; }
127                 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
128                 
129                 else {
130                         vector<string> myArray = setParameters();
131                         
132                         OptionParser parser(option);
133                         map<string,string> parameters = parser.getParameters();
134                         
135                         ValidParameters validParameter("screen.seqs");
136                         map<string,string>::iterator it;
137                         
138                         //check to make sure all parameters are valid for command
139                         for (it = parameters.begin(); it != parameters.end(); it++) { 
140                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
141                         }
142                         
143                         //initialize outputTypes
144                         vector<string> tempOutNames;
145                         outputTypes["fasta"] = tempOutNames;
146                         outputTypes["name"] = tempOutNames;
147                         outputTypes["group"] = tempOutNames;
148                         outputTypes["alignreport"] = tempOutNames;
149                         outputTypes["accnos"] = tempOutNames;
150                         outputTypes["qfile"] = tempOutNames;
151                         outputTypes["taxonomy"] = tempOutNames;
152                         
153                         //if the user changes the input directory command factory will send this info to us in the output parameter 
154                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
155                         if (inputDir == "not found"){   inputDir = "";          }
156                         else {
157                                 string path;
158                                 it = parameters.find("fasta");
159                                 //user has given a template file
160                                 if(it != parameters.end()){ 
161                                         path = m->hasPath(it->second);
162                                         //if the user has not given a path then, add inputdir. else leave path alone.
163                                         if (path == "") {       parameters["fasta"] = inputDir + it->second;            }
164                                 }
165                                 
166                                 it = parameters.find("group");
167                                 //user has given a template file
168                                 if(it != parameters.end()){ 
169                                         path = m->hasPath(it->second);
170                                         //if the user has not given a path then, add inputdir. else leave path alone.
171                                         if (path == "") {       parameters["group"] = inputDir + it->second;            }
172                                 }
173                                 
174                                 it = parameters.find("name");
175                                 //user has given a template file
176                                 if(it != parameters.end()){ 
177                                         path = m->hasPath(it->second);
178                                         //if the user has not given a path then, add inputdir. else leave path alone.
179                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
180                                 }
181                                 
182                                 it = parameters.find("alignreport");
183                                 //user has given a template file
184                                 if(it != parameters.end()){ 
185                                         path = m->hasPath(it->second);
186                                         //if the user has not given a path then, add inputdir. else leave path alone.
187                                         if (path == "") {       parameters["alignreport"] = inputDir + it->second;              }
188                                 }
189                                 
190                                 it = parameters.find("qfile");
191                                 //user has given a template file
192                                 if(it != parameters.end()){ 
193                                         path = m->hasPath(it->second);
194                                         //if the user has not given a path then, add inputdir. else leave path alone.
195                                         if (path == "") {       parameters["qfile"] = inputDir + it->second;            }
196                                 }
197                                 
198                                 it = parameters.find("taxonomy");
199                                 //user has given a template file
200                                 if(it != parameters.end()){ 
201                                         path = m->hasPath(it->second);
202                                         //if the user has not given a path then, add inputdir. else leave path alone.
203                                         if (path == "") {       parameters["taxonomy"] = inputDir + it->second;         }
204                                 }
205                         }
206
207                         //check for required parameters
208                         fastafile = validParameter.validFile(parameters, "fasta", true);
209                         if (fastafile == "not found") {                         
210                                 fastafile = m->getFastaFile(); 
211                                 if (fastafile != "") { m->mothurOut("Using " + fastafile + " as input file for the fasta parameter."); m->mothurOutEndLine(); }
212                                 else {  m->mothurOut("You have no current fastafile and the fasta parameter is required."); m->mothurOutEndLine(); abort = true; }
213                         }
214                         else if (fastafile == "not open") { abort = true; }
215                         else { m->setFastaFile(fastafile); }
216         
217                         groupfile = validParameter.validFile(parameters, "group", true);
218                         if (groupfile == "not open") { abort = true; }  
219                         else if (groupfile == "not found") { groupfile = ""; }
220                         else { m->setGroupFile(groupfile); }
221                         
222                         qualfile = validParameter.validFile(parameters, "qfile", true);
223                         if (qualfile == "not open") { abort = true; }   
224                         else if (qualfile == "not found") { qualfile = ""; }
225                         else { m->setQualFile(qualfile); }
226                         
227                         namefile = validParameter.validFile(parameters, "name", true);
228                         if (namefile == "not open") { namefile = ""; abort = true; }
229                         else if (namefile == "not found") { namefile = ""; }    
230                         else { m->setNameFile(namefile); }
231                         
232                         alignreport = validParameter.validFile(parameters, "alignreport", true);
233                         if (alignreport == "not open") { abort = true; }
234                         else if (alignreport == "not found") { alignreport = ""; }
235                         
236                         taxonomy = validParameter.validFile(parameters, "taxonomy", true);
237                         if (taxonomy == "not open") { abort = true; }
238                         else if (taxonomy == "not found") { taxonomy = ""; }    
239                         
240                         //if the user changes the output directory command factory will send this info to us in the output parameter 
241                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  
242                                 outputDir = ""; 
243                                 outputDir += m->hasPath(fastafile); //if user entered a file with a path then preserve it       
244                         }
245
246                         //check for optional parameter and set defaults
247                         // ...at some point should added some additional type checking...
248                         string temp;
249                         temp = validParameter.validFile(parameters, "start", false);            if (temp == "not found") { temp = "-1"; }
250                         m->mothurConvert(temp, startPos); 
251                 
252                         temp = validParameter.validFile(parameters, "end", false);                      if (temp == "not found") { temp = "-1"; }
253                         m->mothurConvert(temp, endPos);  
254
255                         temp = validParameter.validFile(parameters, "maxambig", false);         if (temp == "not found") { temp = "-1"; }
256                         m->mothurConvert(temp, maxAmbig);  
257
258                         temp = validParameter.validFile(parameters, "maxhomop", false);         if (temp == "not found") { temp = "-1"; }
259                         m->mothurConvert(temp, maxHomoP);  
260
261                         temp = validParameter.validFile(parameters, "minlength", false);        if (temp == "not found") { temp = "-1"; }
262                         m->mothurConvert(temp, minLength); 
263                         
264                         temp = validParameter.validFile(parameters, "maxlength", false);        if (temp == "not found") { temp = "-1"; }
265                         m->mothurConvert(temp, maxLength); 
266                         
267                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found"){       temp = m->getProcessors();      }
268                         m->setProcessors(temp);
269                         m->mothurConvert(temp, processors);
270                         
271                         temp = validParameter.validFile(parameters, "optimize", false); //optimizing trumps the optimized values original value
272                         if (temp == "not found"){       temp = "none";          }
273                         m->splitAtDash(temp, optimize);         
274                         
275                         //check for invalid optimize options
276                         set<string> validOptimizers;
277                         validOptimizers.insert("none"); validOptimizers.insert("start"); validOptimizers.insert("end"); validOptimizers.insert("maxambig"); validOptimizers.insert("maxhomop"); validOptimizers.insert("minlength"); validOptimizers.insert("maxlength");
278                         for (int i = 0; i < optimize.size(); i++) { 
279                                 if (validOptimizers.count(optimize[i]) == 0) { 
280                                         m->mothurOut(optimize[i] + " is not a valid optimizer. Valid options are start, end, maxambig, maxhomop, minlength and maxlength."); m->mothurOutEndLine();
281                                         optimize.erase(optimize.begin()+i);
282                                         i--;
283                                 }
284                         }
285                         
286                         if (optimize.size() == 1) { if (optimize[0] == "none") { optimize.clear(); } }
287                         
288                         temp = validParameter.validFile(parameters, "criteria", false); if (temp == "not found"){       temp = "90";                            }
289                         m->mothurConvert(temp, criteria); 
290                         
291                         if (namefile == "") {
292                                 vector<string> files; files.push_back(fastafile);
293                                 parser.getNameFile(files);
294                         }
295                 }
296
297         }
298         catch(exception& e) {
299                 m->errorOut(e, "ScreenSeqsCommand", "ScreenSeqsCommand");
300                 exit(1);
301         }
302 }
303 //***************************************************************************************************************
304
305 int ScreenSeqsCommand::execute(){
306         try{
307                 
308                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
309                 
310                 //if the user want to optimize we need to know the 90% mark
311                 vector<unsigned long long> positions;
312                 if (optimize.size() != 0) {  //get summary is paralellized so we need to divideFile, no need to do this step twice so I moved it here
313                         //use the namefile to optimize correctly
314                         if (namefile != "") { nameMap = m->readNames(namefile); }
315                         getSummary(positions); 
316                 } 
317                 else { 
318                         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
319                 positions = m->divideFile(fastafile, processors);
320                 for (int i = 0; i < (positions.size()-1); i++) { lines.push_back(linePair(positions[i], positions[(i+1)])); }
321                         #else 
322                 if(processors == 1){ lines.push_back(linePair(0, 1000));  }
323                 else {
324                     int numFastaSeqs = 0;
325                     positions = m->setFilePosFasta(fastafile, numFastaSeqs); 
326                     if (positions.size() < processors) { processors = positions.size(); }
327                 
328                     //figure out how many sequences you have to process
329                     int numSeqsPerProcessor = numFastaSeqs / processors;
330                     for (int i = 0; i < processors; i++) {
331                         int startIndex =  i * numSeqsPerProcessor;
332                         if(i == (processors - 1)){      numSeqsPerProcessor = numFastaSeqs - i * numSeqsPerProcessor;   }
333                         lines.push_back(linePair(positions[startIndex], numSeqsPerProcessor));
334                     }
335                 }
336                         #endif
337                 }
338                                         
339                 string goodSeqFile = outputDir + m->getRootName(m->getSimpleName(fastafile)) + getOutputFileNameTag("fasta", fastafile);
340                 string badAccnosFile =  outputDir + m->getRootName(m->getSimpleName(fastafile)) + getOutputFileNameTag("accnos");
341                 
342                 int numFastaSeqs = 0;
343                 set<string> badSeqNames;
344                 int start = time(NULL);
345         
346 #ifdef USE_MPI  
347                         int pid, numSeqsPerProcessor; 
348                         int tag = 2001;
349                         vector<unsigned long long> MPIPos;
350                         
351                         MPI_Status status; 
352                         MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
353                         MPI_Comm_size(MPI_COMM_WORLD, &processors); 
354         
355                         MPI_File inMPI;
356                         MPI_File outMPIGood;
357                         MPI_File outMPIBadAccnos;
358                         
359                         int outMode=MPI_MODE_CREATE|MPI_MODE_WRONLY; 
360                         int inMode=MPI_MODE_RDONLY; 
361                         
362                         char outGoodFilename[1024];
363                         strcpy(outGoodFilename, goodSeqFile.c_str());
364
365                         char outBadAccnosFilename[1024];
366                         strcpy(outBadAccnosFilename, badAccnosFile.c_str());
367
368                         char inFileName[1024];
369                         strcpy(inFileName, fastafile.c_str());
370                         
371                         MPI_File_open(MPI_COMM_WORLD, inFileName, inMode, MPI_INFO_NULL, &inMPI);  //comm, filename, mode, info, filepointer
372                         MPI_File_open(MPI_COMM_WORLD, outGoodFilename, outMode, MPI_INFO_NULL, &outMPIGood);
373                         MPI_File_open(MPI_COMM_WORLD, outBadAccnosFilename, outMode, MPI_INFO_NULL, &outMPIBadAccnos);
374                         
375                         if (m->control_pressed) { MPI_File_close(&inMPI);  MPI_File_close(&outMPIGood); MPI_File_close(&outMPIBadAccnos); return 0; }
376                         
377                         if (pid == 0) { //you are the root process 
378                                 
379                                 MPIPos = m->setFilePosFasta(fastafile, numFastaSeqs); //fills MPIPos, returns numSeqs
380                                 
381                                 //send file positions to all processes
382                                 for(int i = 1; i < processors; i++) { 
383                                         MPI_Send(&numFastaSeqs, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
384                                         MPI_Send(&MPIPos[0], (numFastaSeqs+1), MPI_LONG, i, tag, MPI_COMM_WORLD);
385                                 }
386                                 
387                                 //figure out how many sequences you have to align
388                                 numSeqsPerProcessor = numFastaSeqs / processors;
389                                 int startIndex =  pid * numSeqsPerProcessor;
390                                 if(pid == (processors - 1)){    numSeqsPerProcessor = numFastaSeqs - pid * numSeqsPerProcessor;         }
391
392                                 //align your part
393                                 driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPIGood, outMPIBadAccnos, MPIPos, badSeqNames);
394
395                                 if (m->control_pressed) { MPI_File_close(&inMPI);  MPI_File_close(&outMPIGood);  MPI_File_close(&outMPIBadAccnos);  return 0; }
396
397                                 for (int i = 1; i < processors; i++) {
398                                         //get bad lists
399                                         int badSize;
400                                         MPI_Recv(&badSize, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
401                                 }
402                         }else{ //you are a child process
403                                 MPI_Recv(&numFastaSeqs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
404                                 MPIPos.resize(numFastaSeqs+1);
405                                 MPI_Recv(&MPIPos[0], (numFastaSeqs+1), MPI_LONG, 0, tag, MPI_COMM_WORLD, &status);
406
407                                 //figure out how many sequences you have to align
408                                 numSeqsPerProcessor = numFastaSeqs / processors;
409                                 int startIndex =  pid * numSeqsPerProcessor;
410                                 if(pid == (processors - 1)){    numSeqsPerProcessor = numFastaSeqs - pid * numSeqsPerProcessor;         }
411
412                                 //align your part
413                                 driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPIGood, outMPIBadAccnos, MPIPos, badSeqNames);
414
415                                 if (m->control_pressed) { MPI_File_close(&inMPI);  MPI_File_close(&outMPIGood);  MPI_File_close(&outMPIBadAccnos); return 0; }
416                                 
417                                 //send bad list 
418                                 int badSize = badSeqNames.size();
419                                 MPI_Send(&badSize, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
420                         }
421                         
422                         //close files 
423                         MPI_File_close(&inMPI);
424                         MPI_File_close(&outMPIGood);
425                         MPI_File_close(&outMPIBadAccnos);
426                         MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case
427                                         
428 #else
429         if(processors == 1){ numFastaSeqs = driver(lines[0], goodSeqFile, badAccnosFile, fastafile, badSeqNames);       }       
430         else{ numFastaSeqs = createProcesses(goodSeqFile, badAccnosFile, fastafile, badSeqNames); }
431         
432         if (m->control_pressed) { m->mothurRemove(goodSeqFile); return 0; }
433 #endif          
434
435                 #ifdef USE_MPI
436                         MPI_Comm_rank(MPI_COMM_WORLD, &pid); 
437                                         
438                         if (pid == 0) { //only one process should fix files
439                         
440                                 //read accnos file with all names in it, process 0 just has its names
441                                 MPI_File inMPIAccnos;
442                                 MPI_Offset size;
443                         
444                                 char inFileName[1024];
445                                 strcpy(inFileName, badAccnosFile.c_str());
446                         
447                                 MPI_File_open(MPI_COMM_SELF, inFileName, inMode, MPI_INFO_NULL, &inMPIAccnos);  //comm, filename, mode, info, filepointer
448                                 MPI_File_get_size(inMPIAccnos, &size);
449                         
450                                 char* buffer = new char[size];
451                                 MPI_File_read(inMPIAccnos, buffer, size, MPI_CHAR, &status);
452                         
453                                 string tempBuf = buffer;
454                                 if (tempBuf.length() > size) { tempBuf = tempBuf.substr(0, size);  }
455                                 istringstream iss (tempBuf,istringstream::in);
456
457                                 delete buffer;
458                                 MPI_File_close(&inMPIAccnos);
459                                 
460                                 badSeqNames.clear();
461                                 string tempName;
462                                 while (!iss.eof()) {
463                                         iss >> tempName; m->gobble(iss);
464                                         badSeqNames.insert(tempName);
465                                 }
466                 #endif
467                                                                                                                                                                         
468                 if(namefile != "" && groupfile != "")   {       
469                         screenNameGroupFile(badSeqNames);       
470                         if (m->control_pressed) {  m->mothurRemove(goodSeqFile); return 0; }
471                 }else if(namefile != "")        {       
472                         screenNameGroupFile(badSeqNames);
473                         if (m->control_pressed) {  m->mothurRemove(goodSeqFile);  return 0; }   
474                 }else if(groupfile != "")                               {       screenGroupFile(badSeqNames);           }       // this screens just the group
475                 
476                 if (m->control_pressed) { m->mothurRemove(goodSeqFile);  return 0; }
477
478                 if(alignreport != "")                                   {       screenAlignReport(badSeqNames);         }
479                 if(qualfile != "")                                              {       screenQual(badSeqNames);                        }
480                 if(taxonomy != "")                                              {       screenTaxonomy(badSeqNames);            }
481                 
482                 if (m->control_pressed) { m->mothurRemove(goodSeqFile);  return 0; }
483                 
484                 #ifdef USE_MPI
485                         }
486                 #endif
487
488                 m->mothurOutEndLine();
489                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
490                 m->mothurOut(goodSeqFile); m->mothurOutEndLine();       outputTypes["fasta"].push_back(goodSeqFile);
491                 m->mothurOut(badAccnosFile); m->mothurOutEndLine();      outputTypes["accnos"].push_back(badAccnosFile);
492                 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
493                 m->mothurOutEndLine();
494                 m->mothurOutEndLine();
495                 
496                 //set fasta file as new current fastafile
497                 string current = "";
498                 itTypes = outputTypes.find("fasta");
499                 if (itTypes != outputTypes.end()) {
500                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFastaFile(current); }
501                 }
502                 
503                 itTypes = outputTypes.find("name");
504                 if (itTypes != outputTypes.end()) {
505                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setNameFile(current); }
506                 }
507                 
508                 itTypes = outputTypes.find("group");
509                 if (itTypes != outputTypes.end()) {
510                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setGroupFile(current); }
511                 }
512                 
513                 itTypes = outputTypes.find("qfile");
514                 if (itTypes != outputTypes.end()) {
515                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setQualFile(current); }
516                 }
517                 
518                 itTypes = outputTypes.find("taxonomy");
519                 if (itTypes != outputTypes.end()) {
520                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setTaxonomyFile(current); }
521                 }
522
523                 m->mothurOut("It took " + toString(time(NULL) - start) + " secs to screen " + toString(numFastaSeqs) + " sequences.");
524                 m->mothurOutEndLine();
525
526                 return 0;
527         }
528         catch(exception& e) {
529                 m->errorOut(e, "ScreenSeqsCommand", "execute");
530                 exit(1);
531         }
532 }
533
534 //***************************************************************************************************************
535
536 int ScreenSeqsCommand::screenNameGroupFile(set<string> badSeqNames){
537         try {
538                 ifstream inputNames;
539                 m->openInputFile(namefile, inputNames);
540                 set<string> badSeqGroups;
541                 string seqName, seqList, group;
542                 set<string>::iterator it;
543
544                 string goodNameFile = outputDir + m->getRootName(m->getSimpleName(namefile)) + getOutputFileNameTag("name", namefile);
545                 outputNames.push_back(goodNameFile);  outputTypes["name"].push_back(goodNameFile);
546                 
547                 ofstream goodNameOut;   m->openOutputFile(goodNameFile, goodNameOut);
548                 
549                 while(!inputNames.eof()){
550                         if (m->control_pressed) { goodNameOut.close();  inputNames.close(); m->mothurRemove(goodNameFile);  return 0; }
551
552                         inputNames >> seqName >> seqList;
553                         it = badSeqNames.find(seqName);
554                                 
555                         if(it != badSeqNames.end()){
556                                 badSeqNames.erase(it);
557                                 
558                                 if(namefile != ""){
559                                         int start = 0;
560                                         for(int i=0;i<seqList.length();i++){
561                                                 if(seqList[i] == ','){
562                                                         badSeqGroups.insert(seqList.substr(start,i-start));
563                                                         start = i+1;
564                                                 }                                       
565                                         }
566                                         badSeqGroups.insert(seqList.substr(start,seqList.length()-start));
567                                 }
568                         }
569                         else{
570                                 goodNameOut << seqName << '\t' << seqList << endl;
571                         }
572                         m->gobble(inputNames);
573                 }
574                 inputNames.close();
575                 goodNameOut.close();
576         
577                 //we were unable to remove some of the bad sequences
578                 if (badSeqNames.size() != 0) {
579                         for (it = badSeqNames.begin(); it != badSeqNames.end(); it++) {  
580                                 m->mothurOut("Your namefile does not include the sequence " + *it + " please correct."); 
581                                 m->mothurOutEndLine();
582                         }
583                 }
584
585                 if(groupfile != ""){
586                         
587                         ifstream inputGroups;
588                         m->openInputFile(groupfile, inputGroups);
589
590                         string goodGroupFile = outputDir + m->getRootName(m->getSimpleName(groupfile)) + getOutputFileNameTag("group", groupfile);
591                         outputNames.push_back(goodGroupFile);   outputTypes["group"].push_back(goodGroupFile);
592                         
593                         ofstream goodGroupOut;  m->openOutputFile(goodGroupFile, goodGroupOut);
594                         
595                         while(!inputGroups.eof()){
596                                 if (m->control_pressed) { goodGroupOut.close(); inputGroups.close(); m->mothurRemove(goodNameFile);  m->mothurRemove(goodGroupFile); return 0; }
597
598                                 inputGroups >> seqName >> group;
599                                 
600                                 it = badSeqGroups.find(seqName);
601                                 
602                                 if(it != badSeqGroups.end()){
603                                         badSeqGroups.erase(it);
604                                 }
605                                 else{
606                                         goodGroupOut << seqName << '\t' << group << endl;
607                                 }
608                                 m->gobble(inputGroups);
609                         }
610                         inputGroups.close();
611                         goodGroupOut.close();
612                         
613                         //we were unable to remove some of the bad sequences
614                         if (badSeqGroups.size() != 0) {
615                                 for (it = badSeqGroups.begin(); it != badSeqGroups.end(); it++) {  
616                                         m->mothurOut("Your groupfile does not include the sequence " + *it + " please correct."); 
617                                         m->mothurOutEndLine();
618                                 }
619                         }
620                 }
621                 
622                 
623                 return 0;
624         
625         }
626         catch(exception& e) {
627                 m->errorOut(e, "ScreenSeqsCommand", "screenNameGroupFile");
628                 exit(1);
629         }
630 }
631 //***************************************************************************************************************
632 int ScreenSeqsCommand::getSummary(vector<unsigned long long>& positions){
633         try {
634                 
635                 vector<int> startPosition;
636                 vector<int> endPosition;
637                 vector<int> seqLength;
638                 vector<int> ambigBases;
639                 vector<int> longHomoPolymer;
640                 
641         vector<unsigned long long> positions;
642 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
643                 positions = m->divideFile(fastafile, processors);
644                 for (int i = 0; i < (positions.size()-1); i++) { lines.push_back(linePair(positions[i], positions[(i+1)])); }   
645 #else
646                 if(processors == 1){ lines.push_back(linePair(0, 1000));  }
647         else {
648             int numFastaSeqs = 0;
649             positions = m->setFilePosFasta(fastafile, numFastaSeqs); 
650             if (positions.size() < processors) { processors = positions.size(); }
651             
652             //figure out how many sequences you have to process
653             int numSeqsPerProcessor = numFastaSeqs / processors;
654             for (int i = 0; i < processors; i++) {
655                 int startIndex =  i * numSeqsPerProcessor;
656                 if(i == (processors - 1)){      numSeqsPerProcessor = numFastaSeqs - i * numSeqsPerProcessor;   }
657                 lines.push_back(linePair(positions[startIndex], numSeqsPerProcessor));
658             }
659         }
660 #endif
661                 
662 #ifdef USE_MPI
663                 int pid;
664                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); 
665                 
666                 if (pid == 0) { 
667                         driverCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, fastafile, lines[0]);
668 #else
669                 int numSeqs = 0;
670                 //#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
671                         if(processors == 1){
672                                 numSeqs = driverCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, fastafile, lines[0]);
673                         }else{
674                                 numSeqs = createProcessesCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, fastafile); 
675                         }
676                                 
677                         if (m->control_pressed) {  return 0; }
678                 //#else
679                 //      numSeqs = driverCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, fastafile, lines[0]);
680                 //      if (m->control_pressed) {  return 0; }
681                 //#endif
682 #endif
683                 sort(startPosition.begin(), startPosition.end());
684                 sort(endPosition.begin(), endPosition.end());
685                 sort(seqLength.begin(), seqLength.end());
686                 sort(ambigBases.begin(), ambigBases.end());
687                 sort(longHomoPolymer.begin(), longHomoPolymer.end());
688                 
689                 //numSeqs is the number of unique seqs, startPosition.size() is the total number of seqs, we want to optimize using all seqs
690                 int criteriaPercentile  = int(startPosition.size() * (criteria / (float) 100));
691                 
692                 for (int i = 0; i < optimize.size(); i++) {
693                         if (optimize[i] == "start") { startPos = startPosition[criteriaPercentile]; m->mothurOut("Optimizing start to " + toString(startPos) + "."); m->mothurOutEndLine(); }
694                         else if (optimize[i] == "end") { int endcriteriaPercentile = int(endPosition.size() * ((100 - criteria) / (float) 100));  endPos = endPosition[endcriteriaPercentile]; m->mothurOut("Optimizing end to " + toString(endPos) + "."); m->mothurOutEndLine();}
695                         else if (optimize[i] == "maxambig") { maxAmbig = ambigBases[criteriaPercentile]; m->mothurOut("Optimizing maxambig to " + toString(maxAmbig) + "."); m->mothurOutEndLine(); }
696                         else if (optimize[i] == "maxhomop") { maxHomoP = longHomoPolymer[criteriaPercentile]; m->mothurOut("Optimizing maxhomop to " + toString(maxHomoP) + "."); m->mothurOutEndLine(); }
697                         else if (optimize[i] == "minlength") { int mincriteriaPercentile = int(seqLength.size() * ((100 - criteria) / (float) 100)); minLength = seqLength[mincriteriaPercentile]; m->mothurOut("Optimizing minlength to " + toString(minLength) + "."); m->mothurOutEndLine(); }
698                         else if (optimize[i] == "maxlength") { maxLength = seqLength[criteriaPercentile]; m->mothurOut("Optimizing maxlength to " + toString(maxLength) + "."); m->mothurOutEndLine(); }
699                 }
700
701 #ifdef USE_MPI
702                 }
703                 
704                 MPI_Status status; 
705                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); 
706                 MPI_Comm_size(MPI_COMM_WORLD, &processors); 
707                         
708                 if (pid == 0) { 
709                         //send file positions to all processes
710                         for(int i = 1; i < processors; i++) { 
711                                 MPI_Send(&startPos, 1, MPI_INT, i, 2001, MPI_COMM_WORLD);
712                                 MPI_Send(&endPos, 1, MPI_INT, i, 2001, MPI_COMM_WORLD);
713                                 MPI_Send(&maxAmbig, 1, MPI_INT, i, 2001, MPI_COMM_WORLD);
714                                 MPI_Send(&maxHomoP, 1, MPI_INT, i, 2001, MPI_COMM_WORLD);
715                                 MPI_Send(&minLength, 1, MPI_INT, i, 2001, MPI_COMM_WORLD);
716                                 MPI_Send(&maxLength, 1, MPI_INT, i, 2001, MPI_COMM_WORLD);
717                         }
718                 }else {
719                         MPI_Recv(&startPos, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status);
720                         MPI_Recv(&endPos, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status);
721                         MPI_Recv(&maxAmbig, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status);
722                         MPI_Recv(&maxHomoP, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status);
723                         MPI_Recv(&minLength, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status);
724                         MPI_Recv(&maxLength, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status);
725                 }
726                 MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case
727 #endif
728                 return 0;
729         }
730         catch(exception& e) {
731                 m->errorOut(e, "ScreenSeqsCommand", "getSummary");
732                 exit(1);
733         }
734 }
735 /**************************************************************************************/
736 int ScreenSeqsCommand::driverCreateSummary(vector<int>& startPosition, vector<int>& endPosition, vector<int>& seqLength, vector<int>& ambigBases, vector<int>& longHomoPolymer, string filename, linePair filePos) {    
737         try {
738                 
739                 ifstream in;
740                 m->openInputFile(filename, in);
741                                 
742                 in.seekg(filePos.start);
743
744                 bool done = false;
745                 int count = 0;
746         
747                 while (!done) {
748                                 
749                         if (m->control_pressed) { in.close(); return 1; }
750                                         
751                         Sequence current(in); m->gobble(in);
752         
753                         if (current.getName() != "") {
754                                 int num = 1;
755                                 if (namefile != "") {
756                                         //make sure this sequence is in the namefile, else error 
757                                         map<string, int>::iterator it = nameMap.find(current.getName());
758                                         
759                                         if (it == nameMap.end()) { m->mothurOut("[ERROR]: " + current.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); m->control_pressed = true; }
760                                         else { num = it->second; }
761                                 }
762                                 
763                                 //for each sequence this sequence represents
764                                 for (int i = 0; i < num; i++) {
765                                         startPosition.push_back(current.getStartPos());
766                                         endPosition.push_back(current.getEndPos());
767                                         seqLength.push_back(current.getNumBases());
768                                         ambigBases.push_back(current.getAmbigBases());
769                                         longHomoPolymer.push_back(current.getLongHomoPolymer());
770                                 }
771                                 
772                                 count++;
773                         }
774                         //if((count) % 100 == 0){       m->mothurOut("Optimizing sequence: " + toString(count)); m->mothurOutEndLine();         }
775                         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
776                                 unsigned long long pos = in.tellg();
777                                 if ((pos == -1) || (pos >= filePos.end)) { break; }
778                         #else
779                                 if (in.eof()) { break; }
780                         #endif
781                         
782                 }
783                 
784                 in.close();
785                 
786                 return count;
787         }
788         catch(exception& e) {
789                 m->errorOut(e, "ScreenSeqsCommand", "driverCreateSummary");
790                 exit(1);
791         }
792 }
793 /**************************************************************************************************/
794 int ScreenSeqsCommand::createProcessesCreateSummary(vector<int>& startPosition, vector<int>& endPosition, vector<int>& seqLength, vector<int>& ambigBases, vector<int>& longHomoPolymer, string filename) {
795         try {
796         
797         int process = 1;
798                 int num = 0;
799                 vector<int> processIDS;
800
801 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
802                                 
803                 //loop through and create all the processes you want
804                 while (process != processors) {
805                         int pid = fork();
806                         
807                         if (pid > 0) {
808                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
809                                 process++;
810                         }else if (pid == 0){
811                                 num = driverCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, fastafile, lines[process]);
812                                 
813                                 //pass numSeqs to parent
814                                 ofstream out;
815                                 string tempFile = fastafile + toString(getpid()) + ".num.temp";
816                                 m->openOutputFile(tempFile, out);
817                                 
818                                 out << num << endl;
819                                 out << startPosition.size() << endl;
820                                 for (int k = 0; k < startPosition.size(); k++)          {               out << startPosition[k] << '\t'; }  out << endl;
821                                 for (int k = 0; k < endPosition.size(); k++)            {               out << endPosition[k] << '\t'; }  out << endl;
822                                 for (int k = 0; k < seqLength.size(); k++)                      {               out << seqLength[k] << '\t'; }  out << endl;
823                                 for (int k = 0; k < ambigBases.size(); k++)                     {               out << ambigBases[k] << '\t'; }  out << endl;
824                                 for (int k = 0; k < longHomoPolymer.size(); k++)        {               out << longHomoPolymer[k] << '\t'; }  out << endl;
825                                 
826                                 out.close();
827                                 
828                                 exit(0);
829                         }else { 
830                                 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine(); 
831                                 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
832                                 exit(0);
833                         }
834                 }
835                 
836                 num = driverCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, fastafile, lines[0]);
837                 
838                 //force parent to wait until all the processes are done
839                 for (int i=0;i<processIDS.size();i++) { 
840                         int temp = processIDS[i];
841                         wait(&temp);
842                 }
843                 
844                 //parent reads in and combine Filter info
845                 for (int i = 0; i < processIDS.size(); i++) {
846                         string tempFilename = fastafile + toString(processIDS[i]) + ".num.temp";
847                         ifstream in;
848                         m->openInputFile(tempFilename, in);
849                         
850                         int temp, tempNum;
851                         in >> tempNum; m->gobble(in); num += tempNum;
852                         in >> tempNum; m->gobble(in);
853                         for (int k = 0; k < tempNum; k++)                       {               in >> temp; startPosition.push_back(temp);              }               m->gobble(in);
854                         for (int k = 0; k < tempNum; k++)                       {               in >> temp; endPosition.push_back(temp);                }               m->gobble(in);
855                         for (int k = 0; k < tempNum; k++)                       {               in >> temp; seqLength.push_back(temp);                  }               m->gobble(in);
856                         for (int k = 0; k < tempNum; k++)                       {               in >> temp; ambigBases.push_back(temp);                 }               m->gobble(in);
857                         for (int k = 0; k < tempNum; k++)                       {               in >> temp; longHomoPolymer.push_back(temp);    }               m->gobble(in);
858                                 
859                         in.close();
860                         m->mothurRemove(tempFilename);
861                 }
862                 
863                 
864 #else 
865         //////////////////////////////////////////////////////////////////////////////////////////////////////
866                 //Windows version shared memory, so be careful when passing variables through the seqSumData struct. 
867                 //Above fork() will clone, so memory is separate, but that's not the case with windows, 
868                 //Taking advantage of shared memory to allow both threads to add info to vectors.
869                 //////////////////////////////////////////////////////////////////////////////////////////////////////
870                 
871                 vector<sumData*> pDataArray; 
872                 DWORD   dwThreadIdArray[processors-1];
873                 HANDLE  hThreadArray[processors-1]; 
874                 
875                 //Create processor worker threads.
876                 for( int i=0; i<processors-1; i++ ){
877             
878                         // Allocate memory for thread data.
879                         sumData* tempSum = new sumData(filename, m, lines[i].start, lines[i].end, namefile, nameMap);
880                         pDataArray.push_back(tempSum);
881                         
882                         //MySeqSumThreadFunction is in header. It must be global or static to work with the threads.
883                         //default security attributes, thread function name, argument to thread function, use default creation flags, returns the thread identifier
884                         hThreadArray[i] = CreateThread(NULL, 0, MySumThreadFunction, pDataArray[i], 0, &dwThreadIdArray[i]);   
885                 }
886                 
887         //do your part
888                 num = driverCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, fastafile, lines[processors-1]);
889          
890                 //Wait until all threads have terminated.
891                 WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
892                 
893                 //Close all thread handles and free memory allocations.
894                 for(int i=0; i < pDataArray.size(); i++){
895                         num += pDataArray[i]->count;
896             for (int k = 0; k < pDataArray[i]->startPosition.size(); k++) {     startPosition.push_back(pDataArray[i]->startPosition[k]);       }
897                         for (int k = 0; k < pDataArray[i]->endPosition.size(); k++) {   endPosition.push_back(pDataArray[i]->endPosition[k]);       }
898             for (int k = 0; k < pDataArray[i]->seqLength.size(); k++) { seqLength.push_back(pDataArray[i]->seqLength[k]);       }
899             for (int k = 0; k < pDataArray[i]->ambigBases.size(); k++) {        ambigBases.push_back(pDataArray[i]->ambigBases[k]);       }
900             for (int k = 0; k < pDataArray[i]->longHomoPolymer.size(); k++) {   longHomoPolymer.push_back(pDataArray[i]->longHomoPolymer[k]);       }
901                         CloseHandle(hThreadArray[i]);
902                         delete pDataArray[i];
903                 }
904
905 #endif          
906         return num;
907         }
908         catch(exception& e) {
909                 m->errorOut(e, "ScreenSeqsCommand", "createProcessesCreateSummary");
910                 exit(1);
911         }
912 }
913
914 //***************************************************************************************************************
915
916 int ScreenSeqsCommand::screenGroupFile(set<string> badSeqNames){
917         try {
918                 ifstream inputGroups;
919                 m->openInputFile(groupfile, inputGroups);
920                 string seqName, group;
921                 set<string>::iterator it;
922                 
923                 string goodGroupFile = outputDir + m->getRootName(m->getSimpleName(groupfile)) + getOutputFileNameTag("group", groupfile);
924         outputNames.push_back(goodGroupFile);  outputTypes["group"].push_back(goodGroupFile);
925                 ofstream goodGroupOut;  m->openOutputFile(goodGroupFile, goodGroupOut);
926                 
927                 while(!inputGroups.eof()){
928                         if (m->control_pressed) { goodGroupOut.close(); inputGroups.close(); m->mothurRemove(goodGroupFile); return 0; }
929
930                         inputGroups >> seqName >> group;
931                         it = badSeqNames.find(seqName);
932                         
933                         if(it != badSeqNames.end()){
934                                 badSeqNames.erase(it);
935                         }
936                         else{
937                                 goodGroupOut << seqName << '\t' << group << endl;
938                         }
939                         m->gobble(inputGroups);
940                 }
941                 
942                 if (m->control_pressed) { goodGroupOut.close();  inputGroups.close(); m->mothurRemove(goodGroupFile);  return 0; }
943
944                 //we were unable to remove some of the bad sequences
945                 if (badSeqNames.size() != 0) {
946                         for (it = badSeqNames.begin(); it != badSeqNames.end(); it++) {  
947                                 m->mothurOut("Your groupfile does not include the sequence " + *it + " please correct."); 
948                                 m->mothurOutEndLine();
949                         }
950                 }
951                 
952                 inputGroups.close();
953                 goodGroupOut.close();
954                 
955                 if (m->control_pressed) { m->mothurRemove(goodGroupFile);   }
956                 
957                 return 0;
958         
959         }
960         catch(exception& e) {
961                 m->errorOut(e, "ScreenSeqsCommand", "screenGroupFile");
962                 exit(1);
963         }
964 }
965
966 //***************************************************************************************************************
967
968 int ScreenSeqsCommand::screenAlignReport(set<string> badSeqNames){
969         try {
970                 ifstream inputAlignReport;
971                 m->openInputFile(alignreport, inputAlignReport);
972                 string seqName, group;
973                 set<string>::iterator it;
974                 
975                 string goodAlignReportFile = outputDir + m->getRootName(m->getSimpleName(alignreport)) + getOutputFileNameTag("alignreport");
976                 outputNames.push_back(goodAlignReportFile);  outputTypes["alignreport"].push_back(goodAlignReportFile);
977                 ofstream goodAlignReportOut;    m->openOutputFile(goodAlignReportFile, goodAlignReportOut);
978
979                 while (!inputAlignReport.eof()) {               //      need to copy header
980                         char c = inputAlignReport.get();
981                         goodAlignReportOut << c;
982                         if (c == 10 || c == 13){        break;  }       
983                 }
984
985                 while(!inputAlignReport.eof()){
986                         if (m->control_pressed) { goodAlignReportOut.close(); inputAlignReport.close(); m->mothurRemove(goodAlignReportFile); return 0; }
987
988                         inputAlignReport >> seqName;
989                         it = badSeqNames.find(seqName);
990                         string line;            
991                         while (!inputAlignReport.eof()) {               //      need to copy header
992                                 char c = inputAlignReport.get();
993                                 line += c;
994                                 if (c == 10 || c == 13){        break;  }       
995                         }
996                         
997                         if(it != badSeqNames.end()){
998                                 badSeqNames.erase(it);
999                         }
1000                         else{
1001                                 goodAlignReportOut << seqName << '\t' << line;
1002                         }
1003                         m->gobble(inputAlignReport);
1004                 }
1005                 
1006                 if (m->control_pressed) { goodAlignReportOut.close();  inputAlignReport.close(); m->mothurRemove(goodAlignReportFile);  return 0; }
1007
1008                 //we were unable to remove some of the bad sequences
1009                 if (badSeqNames.size() != 0) {
1010                         for (it = badSeqNames.begin(); it != badSeqNames.end(); it++) {  
1011                                 m->mothurOut("Your alignreport file does not include the sequence " + *it + " please correct."); 
1012                                 m->mothurOutEndLine();
1013                         }
1014                 }
1015
1016                 inputAlignReport.close();
1017                 goodAlignReportOut.close();
1018                                 
1019                 if (m->control_pressed) {  m->mothurRemove(goodAlignReportFile);  return 0; }
1020                 
1021                 return 0;
1022         
1023         }
1024         catch(exception& e) {
1025                 m->errorOut(e, "ScreenSeqsCommand", "screenAlignReport");
1026                 exit(1);
1027         }
1028         
1029 }
1030 //***************************************************************************************************************
1031
1032 int ScreenSeqsCommand::screenTaxonomy(set<string> badSeqNames){
1033         try {
1034                 ifstream input;
1035                 m->openInputFile(taxonomy, input);
1036                 string seqName, tax;
1037                 set<string>::iterator it;
1038                 
1039                 string goodTaxFile = outputDir + m->getRootName(m->getSimpleName(taxonomy)) + getOutputFileNameTag("taxonomy", taxonomy);
1040                 outputNames.push_back(goodTaxFile);  outputTypes["taxonomy"].push_back(goodTaxFile);
1041                 ofstream goodTaxOut;    m->openOutputFile(goodTaxFile, goodTaxOut);
1042                                 
1043                 while(!input.eof()){
1044                         if (m->control_pressed) { goodTaxOut.close(); input.close(); m->mothurRemove(goodTaxFile); return 0; }
1045                         
1046                         input >> seqName >> tax;
1047                         it = badSeqNames.find(seqName);
1048                         
1049                         if(it != badSeqNames.end()){ badSeqNames.erase(it); }
1050                         else{
1051                                 goodTaxOut << seqName << '\t' << tax << endl;
1052                         }
1053                         m->gobble(input);
1054                 }
1055                 
1056                 if (m->control_pressed) { goodTaxOut.close(); input.close(); m->mothurRemove(goodTaxFile); return 0; }
1057                 
1058                 //we were unable to remove some of the bad sequences
1059                 if (badSeqNames.size() != 0) {
1060                         for (it = badSeqNames.begin(); it != badSeqNames.end(); it++) {  
1061                                 m->mothurOut("Your taxonomy file does not include the sequence " + *it + " please correct."); 
1062                                 m->mothurOutEndLine();
1063                         }
1064                 }
1065                 
1066                 input.close();
1067                 goodTaxOut.close();
1068                 
1069                 if (m->control_pressed) {  m->mothurRemove(goodTaxFile);  return 0; }
1070                 
1071                 return 0;
1072                 
1073         }
1074         catch(exception& e) {
1075                 m->errorOut(e, "ScreenSeqsCommand", "screenTaxonomy");
1076                 exit(1);
1077         }
1078         
1079 }
1080 //***************************************************************************************************************
1081
1082 int ScreenSeqsCommand::screenQual(set<string> badSeqNames){
1083         try {
1084                 ifstream in;
1085                 m->openInputFile(qualfile, in);
1086                 set<string>::iterator it;
1087                 
1088                 string goodQualFile = outputDir + m->getRootName(m->getSimpleName(qualfile)) + getOutputFileNameTag("qfile", qualfile);
1089                 outputNames.push_back(goodQualFile);  outputTypes["qfile"].push_back(goodQualFile);
1090                 ofstream goodQual;      m->openOutputFile(goodQualFile, goodQual);
1091                 
1092                 while(!in.eof()){       
1093                         
1094                         if (m->control_pressed) { goodQual.close(); in.close(); m->mothurRemove(goodQualFile); return 0; }
1095
1096                         string saveName = "";
1097                         string name = "";
1098                         string scores = "";
1099                         
1100                         in >> name; 
1101                         
1102                         if (name.length() != 0) { 
1103                                 saveName = name.substr(1);
1104                                 while (!in.eof())       {       
1105                                         char c = in.get(); 
1106                                         if (c == 10 || c == 13){        break;  }
1107                                         else { name += c; }     
1108                                 } 
1109                                 m->gobble(in);
1110                         }
1111                         
1112                         while(in){
1113                                 char letter= in.get();
1114                                 if(letter == '>'){      in.putback(letter);     break;  }
1115                                 else{ scores += letter; }
1116                         }
1117                         
1118                         m->gobble(in);
1119                         
1120                         it = badSeqNames.find(saveName);
1121                         
1122                         if(it != badSeqNames.end()){
1123                                 badSeqNames.erase(it);
1124                         }else{                          
1125                                 goodQual << name << endl << scores;
1126                         }
1127                         
1128                         m->gobble(in);
1129                 }
1130                 
1131                 in.close();
1132                 goodQual.close();
1133                 
1134                 //we were unable to remove some of the bad sequences
1135                 if (badSeqNames.size() != 0) {
1136                         for (it = badSeqNames.begin(); it != badSeqNames.end(); it++) {  
1137                                 m->mothurOut("Your qual file does not include the sequence " + *it + " please correct."); 
1138                                 m->mothurOutEndLine();
1139                         }
1140                 }
1141                 
1142                 if (m->control_pressed) {  m->mothurRemove(goodQualFile);  return 0; }
1143                 
1144                 return 0;
1145                 
1146         }
1147         catch(exception& e) {
1148                 m->errorOut(e, "ScreenSeqsCommand", "screenQual");
1149                 exit(1);
1150         }
1151         
1152 }
1153 //**********************************************************************************************************************
1154
1155 int ScreenSeqsCommand::driver(linePair filePos, string goodFName, string badAccnosFName, string filename, set<string>& badSeqNames){
1156         try {
1157                 ofstream goodFile;
1158                 m->openOutputFile(goodFName, goodFile);
1159                 
1160                 ofstream badAccnosFile;
1161                 m->openOutputFile(badAccnosFName, badAccnosFile);
1162                 
1163                 ifstream inFASTA;
1164                 m->openInputFile(filename, inFASTA);
1165
1166                 inFASTA.seekg(filePos.start);
1167
1168                 bool done = false;
1169                 int count = 0;
1170         
1171                 while (!done) {
1172                 
1173                         if (m->control_pressed) {  return 0; }
1174                         
1175                         Sequence currSeq(inFASTA); m->gobble(inFASTA);
1176                         if (currSeq.getName() != "") {
1177                                 bool goodSeq = 1;               //      innocent until proven guilty
1178                                 if(goodSeq == 1 && startPos != -1 && startPos < currSeq.getStartPos())                  {       goodSeq = 0;    }
1179                                 if(goodSeq == 1 && endPos != -1 && endPos > currSeq.getEndPos())                                {       goodSeq = 0;    }
1180                                 if(goodSeq == 1 && maxAmbig != -1 && maxAmbig < currSeq.getAmbigBases())                {       goodSeq = 0;    }
1181                                 if(goodSeq == 1 && maxHomoP != -1 && maxHomoP < currSeq.getLongHomoPolymer())   {       goodSeq = 0;    }
1182                                 if(goodSeq == 1 && minLength != -1 && minLength > currSeq.getNumBases())                {       goodSeq = 0;    }
1183                                 if(goodSeq == 1 && maxLength != -1 && maxLength < currSeq.getNumBases())                {       goodSeq = 0;    }
1184                                 
1185                                 if(goodSeq == 1){
1186                                         currSeq.printSequence(goodFile);        
1187                                 }
1188                                 else{
1189                                         badAccnosFile << currSeq.getName() << endl;
1190                                         badSeqNames.insert(currSeq.getName());
1191                                 }
1192                         count++;
1193                         }
1194                         
1195                         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
1196                                 unsigned long long pos = inFASTA.tellg();
1197                                 if ((pos == -1) || (pos >= filePos.end)) { break; }
1198                         #else
1199                                 if (inFASTA.eof()) { break; }
1200                         #endif
1201                         
1202                         //report progress
1203                         if((count) % 100 == 0){ m->mothurOut("Processing sequence: " + toString(count)); m->mothurOutEndLine();         }
1204                 }
1205                 //report progress
1206                 if((count) % 100 != 0){ m->mothurOut("Processing sequence: " + toString(count)); m->mothurOutEndLine();         }
1207                 
1208                         
1209                 goodFile.close();
1210                 inFASTA.close();
1211                 badAccnosFile.close();
1212                 
1213                 return count;
1214         }
1215         catch(exception& e) {
1216                 m->errorOut(e, "ScreenSeqsCommand", "driver");
1217                 exit(1);
1218         }
1219 }
1220 //**********************************************************************************************************************
1221 #ifdef USE_MPI
1222 int ScreenSeqsCommand::driverMPI(int start, int num, MPI_File& inMPI, MPI_File& goodFile, MPI_File& badAccnosFile, vector<unsigned long long>& MPIPos, set<string>& badSeqNames){
1223         try {
1224                 string outputString = "";
1225                 MPI_Status statusGood; 
1226                 MPI_Status statusBadAccnos; 
1227                 MPI_Status status; 
1228                 int pid;
1229                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
1230
1231                 for(int i=0;i<num;i++){
1232                 
1233                         if (m->control_pressed) {  return 0; }
1234                         
1235                         //read next sequence
1236                         int length = MPIPos[start+i+1] - MPIPos[start+i];
1237
1238                         char* buf4 = new char[length];
1239
1240                         MPI_File_read_at(inMPI, MPIPos[start+i], buf4, length, MPI_CHAR, &status);
1241                         
1242                         string tempBuf = buf4;  delete buf4;
1243                         if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length);  }
1244                         istringstream iss (tempBuf,istringstream::in);
1245                         
1246                         Sequence currSeq(iss);                  
1247                         
1248                         //process seq
1249                         if (currSeq.getName() != "") {
1250                                 bool goodSeq = 1;               //      innocent until proven guilty
1251                                 if(goodSeq == 1 && startPos != -1 && startPos < currSeq.getStartPos())                  {       goodSeq = 0;    }
1252                                 if(goodSeq == 1 && endPos != -1 && endPos > currSeq.getEndPos())                                {       goodSeq = 0;    }
1253                                 if(goodSeq == 1 && maxAmbig != -1 && maxAmbig < currSeq.getAmbigBases())                {       goodSeq = 0;    }
1254                                 if(goodSeq == 1 && maxHomoP != -1 && maxHomoP < currSeq.getLongHomoPolymer())   {       goodSeq = 0;    }
1255                                 if(goodSeq == 1 && minLength != -1 && minLength > currSeq.getNumBases())                {       goodSeq = 0;    }
1256                                 if(goodSeq == 1 && maxLength != -1 && maxLength < currSeq.getNumBases())                {       goodSeq = 0;    }
1257                                 
1258                                 if(goodSeq == 1){
1259                                         outputString =  ">" + currSeq.getName() + "\n" + currSeq.getAligned() + "\n";
1260                                 
1261                                         //print good seq
1262                                         length = outputString.length();
1263                                         char* buf2 = new char[length];
1264                                         memcpy(buf2, outputString.c_str(), length);
1265                                         
1266                                         MPI_File_write_shared(goodFile, buf2, length, MPI_CHAR, &statusGood);
1267                                         delete buf2;
1268                                 }
1269                                 else{
1270
1271                                         badSeqNames.insert(currSeq.getName());
1272                                         
1273                                         //write to bad accnos file
1274                                         outputString = currSeq.getName() + "\n";
1275                                 
1276                                         length = outputString.length();
1277                                         char* buf3 = new char[length];
1278                                         memcpy(buf3, outputString.c_str(), length);
1279                                         
1280                                         MPI_File_write_shared(badAccnosFile, buf3, length, MPI_CHAR, &statusBadAccnos);
1281                                         delete buf3;
1282                                 }
1283                         }
1284                         
1285                         //report progress
1286                         if((i) % 100 == 0){     m->mothurOut("Processing sequence: " + toString(i)); m->mothurOutEndLine();             }
1287                 }
1288                                 
1289                 return 1;
1290         }
1291         catch(exception& e) {
1292                 m->errorOut(e, "ScreenSeqsCommand", "driverMPI");
1293                 exit(1);
1294         }
1295 }
1296 #endif
1297 /**************************************************************************************************/
1298
1299 int ScreenSeqsCommand::createProcesses(string goodFileName, string badAccnos, string filename, set<string>& badSeqNames) {
1300         try {
1301         
1302         vector<int> processIDS;   
1303         int process = 1;
1304                 int num = 0;
1305
1306 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
1307                                 
1308                 //loop through and create all the processes you want
1309                 while (process != processors) {
1310                         int pid = fork();
1311                         
1312                         if (pid > 0) {
1313                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
1314                                 process++;
1315                         }else if (pid == 0){
1316                                 num = driver(lines[process], goodFileName + toString(getpid()) + ".temp", badAccnos + toString(getpid()) + ".temp", filename, badSeqNames);
1317                                 
1318                                 //pass numSeqs to parent
1319                                 ofstream out;
1320                                 string tempFile = filename + toString(getpid()) + ".num.temp";
1321                                 m->openOutputFile(tempFile, out);
1322                                 out << num << endl;
1323                                 out.close();
1324                                 
1325                                 exit(0);
1326                         }else { 
1327                                 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine(); 
1328                                 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
1329                                 exit(0);
1330                         }
1331                 }
1332                 
1333         num = driver(lines[0], goodFileName, badAccnos, filename, badSeqNames);
1334         
1335                 //force parent to wait until all the processes are done
1336                 for (int i=0;i<processIDS.size();i++) { 
1337                         int temp = processIDS[i];
1338                         wait(&temp);
1339                 }
1340                 
1341                 for (int i = 0; i < processIDS.size(); i++) {
1342                         ifstream in;
1343                         string tempFile =  filename + toString(processIDS[i]) + ".num.temp";
1344                         m->openInputFile(tempFile, in);
1345                         if (!in.eof()) { int tempNum = 0; in >> tempNum; num += tempNum; }
1346                         in.close(); m->mothurRemove(tempFile);
1347             
1348             m->appendFiles((goodFileName + toString(processIDS[i]) + ".temp"), goodFileName);
1349             m->mothurRemove((goodFileName + toString(processIDS[i]) + ".temp"));
1350                         
1351             m->appendFiles((badAccnos + toString(processIDS[i]) + ".temp"), badAccnos);
1352             m->mothurRemove((badAccnos + toString(processIDS[i]) + ".temp"));
1353                 }
1354                 
1355         //read badSeqs in because root process doesnt know what other "bad" seqs the children found
1356         ifstream inBad;
1357         int ableToOpen = m->openInputFile(badAccnos, inBad, "no error");
1358         
1359         if (ableToOpen == 0) {
1360             badSeqNames.clear();
1361             string tempName;
1362             while (!inBad.eof()) {
1363                 inBad >> tempName; m->gobble(inBad);
1364                 badSeqNames.insert(tempName);
1365             }
1366             inBad.close();
1367         }
1368 #else
1369         
1370         //////////////////////////////////////////////////////////////////////////////////////////////////////
1371                 //Windows version shared memory, so be careful when passing variables through the sumScreenData struct. 
1372                 //Above fork() will clone, so memory is separate, but that's not the case with windows, 
1373                 //Taking advantage of shared memory to allow both threads to add info to badSeqNames.
1374                 //////////////////////////////////////////////////////////////////////////////////////////////////////
1375                 
1376                 vector<sumScreenData*> pDataArray; 
1377                 DWORD   dwThreadIdArray[processors-1];
1378                 HANDLE  hThreadArray[processors-1]; 
1379                 
1380                 //Create processor worker threads.
1381                 for( int i=0; i<processors-1; i++ ){
1382             
1383             string extension = "";
1384             if (i!=0) {extension += toString(i) + ".temp"; processIDS.push_back(i); }
1385             
1386                         // Allocate memory for thread data.
1387                         sumScreenData* tempSum = new sumScreenData(startPos, endPos, maxAmbig, maxHomoP, minLength, maxLength, filename, m, lines[i].start, lines[i].end,goodFileName+extension, badAccnos+extension);
1388                         pDataArray.push_back(tempSum);
1389                         
1390                         //default security attributes, thread function name, argument to thread function, use default creation flags, returns the thread identifier
1391                         hThreadArray[i] = CreateThread(NULL, 0, MySumScreenThreadFunction, pDataArray[i], 0, &dwThreadIdArray[i]);   
1392                 }
1393                 
1394         //do your part
1395         num = driver(lines[processors-1], (goodFileName+toString(processors-1)+".temp"), (badAccnos+toString(processors-1)+".temp"), filename, badSeqNames);
1396         processIDS.push_back(processors-1);
1397         
1398                 //Wait until all threads have terminated.
1399                 WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
1400                 
1401                 //Close all thread handles and free memory allocations.
1402                 for(int i=0; i < pDataArray.size(); i++){
1403                         num += pDataArray[i]->count;
1404             for (set<string>::iterator it = pDataArray[i]->badSeqNames.begin(); it != pDataArray[i]->badSeqNames.end(); it++) { badSeqNames.insert(*it);       }
1405                         CloseHandle(hThreadArray[i]);
1406                         delete pDataArray[i];
1407                 }
1408         
1409         for (int i = 0; i < processIDS.size(); i++) {
1410             m->appendFiles((goodFileName + toString(processIDS[i]) + ".temp"), goodFileName);
1411             m->mothurRemove((goodFileName + toString(processIDS[i]) + ".temp"));
1412                         
1413             m->appendFiles((badAccnos + toString(processIDS[i]) + ".temp"), badAccnos);
1414             m->mothurRemove((badAccnos + toString(processIDS[i]) + ".temp"));
1415                 }
1416
1417 #endif  
1418         
1419         return num;
1420         
1421         }
1422         catch(exception& e) {
1423                 m->errorOut(e, "ScreenSeqsCommand", "createProcesses");
1424                 exit(1);
1425         }
1426 }
1427
1428 //***************************************************************************************************************
1429
1430