]> git.donarmstrong.com Git - mothur.git/blob - screenseqscommand.cpp
paralellized screen.seqs for windows.
[mothur.git] / screenseqscommand.cpp
1 /*
2  *  screenseqscommand.cpp
3  *  Mothur
4  *
5  *  Created by Pat Schloss on 6/3/09.
6  *  Copyright 2009 Patrick D. Schloss. All rights reserved.
7  *
8  */
9
10 #include "screenseqscommand.h"
11
12
13 //**********************************************************************************************************************
14 vector<string> ScreenSeqsCommand::setParameters(){      
15         try {
16                 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta);
17                 CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname);
18                 CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pgroup);
19                 CommandParameter pqfile("qfile", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pqfile);
20                 CommandParameter palignreport("alignreport", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(palignreport);
21                 CommandParameter ptax("taxonomy", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(ptax);
22                 CommandParameter pstart("start", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pstart);
23                 CommandParameter pend("end", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pend);
24                 CommandParameter pmaxambig("maxambig", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pmaxambig);
25                 CommandParameter pmaxhomop("maxhomop", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pmaxhomop);
26                 CommandParameter pminlength("minlength", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pminlength);
27                 CommandParameter pmaxlength("maxlength", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pmaxlength);
28                 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
29                 CommandParameter pcriteria("criteria", "Number", "", "90", "", "", "",false,false); parameters.push_back(pcriteria);
30                 CommandParameter poptimize("optimize", "Multiple", "none-start-end-maxambig-maxhomop-minlength-maxlength", "none", "", "", "",true,false); parameters.push_back(poptimize);
31                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
32                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
33                 
34                 vector<string> myArray;
35                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
36                 return myArray;
37         }
38         catch(exception& e) {
39                 m->errorOut(e, "ScreenSeqsCommand", "setParameters");
40                 exit(1);
41         }
42 }
43 //**********************************************************************************************************************
44 string ScreenSeqsCommand::getHelpString(){      
45         try {
46                 string helpString = "";
47                 helpString += "The screen.seqs command reads a fastafile and creates .....\n";
48                 helpString += "The screen.seqs command parameters are fasta, start, end, maxambig, maxhomop, minlength, maxlength, name, group, qfile, alignreport, taxonomy, optimize, criteria and processors.\n";
49                 helpString += "The fasta parameter is required.\n";
50                 helpString += "The alignreport and taxonomy parameters allow you to remove bad seqs from taxonomy and alignreport files.\n";
51                 helpString += "The start parameter .... The default is -1.\n";
52                 helpString += "The end parameter .... The default is -1.\n";
53                 helpString += "The maxambig parameter allows you to set the maximum number of ambigious bases allowed. The default is -1.\n";
54                 helpString += "The maxhomop parameter allows you to set a maximum homopolymer length. \n";
55                 helpString += "The minlength parameter allows you to set and minimum sequence length. \n";
56                 helpString += "The maxlength parameter allows you to set and maximum sequence length. \n";
57                 helpString += "The processors parameter allows you to specify the number of processors to use while running the command. The default is 1.\n";
58                 helpString += "The optimize and criteria parameters allow you set the start, end, maxabig, maxhomop, minlength and maxlength parameters relative to your set of sequences .\n";
59                 helpString += "For example optimize=start-end, criteria=90, would set the start and end values to the position 90% of your sequences started and ended.\n";
60                 helpString += "The name parameter allows you to provide a namesfile, and the group parameter allows you to provide a groupfile.\n";
61                 helpString += "The screen.seqs command should be in the following format: \n";
62                 helpString += "screen.seqs(fasta=yourFastaFile, name=youNameFile, group=yourGroupFIle, start=yourStart, end=yourEnd, maxambig=yourMaxambig,  \n";
63                 helpString += "maxhomop=yourMaxhomop, minlength=youMinlength, maxlength=yourMaxlength)  \n";    
64                 helpString += "Example screen.seqs(fasta=abrecovery.fasta, name=abrecovery.names, group=abrecovery.groups, start=..., end=..., maxambig=..., maxhomop=..., minlength=..., maxlength=...).\n";
65                 helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n";
66                 return helpString;
67         }
68         catch(exception& e) {
69                 m->errorOut(e, "ScreenSeqsCommand", "getHelpString");
70                 exit(1);
71         }
72 }
73 //**********************************************************************************************************************
74 ScreenSeqsCommand::ScreenSeqsCommand(){ 
75         try {
76                 abort = true; calledHelp = true; 
77                 setParameters();
78                 vector<string> tempOutNames;
79                 outputTypes["fasta"] = tempOutNames;
80                 outputTypes["name"] = tempOutNames;
81                 outputTypes["group"] = tempOutNames;
82                 outputTypes["alignreport"] = tempOutNames;
83                 outputTypes["accnos"] = tempOutNames;
84                 outputTypes["qfile"] = tempOutNames;
85                 outputTypes["taxonomy"] = tempOutNames;
86         }
87         catch(exception& e) {
88                 m->errorOut(e, "ScreenSeqsCommand", "ScreenSeqsCommand");
89                 exit(1);
90         }
91 }
92 //***************************************************************************************************************
93
94 ScreenSeqsCommand::ScreenSeqsCommand(string option)  {
95         try {
96                 abort = false; calledHelp = false;   
97                 
98                 //allow user to run help
99                 if(option == "help") { help(); abort = true; calledHelp = true; }
100                 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
101                 
102                 else {
103                         vector<string> myArray = setParameters();
104                         
105                         OptionParser parser(option);
106                         map<string,string> parameters = parser.getParameters();
107                         
108                         ValidParameters validParameter("screen.seqs");
109                         map<string,string>::iterator it;
110                         
111                         //check to make sure all parameters are valid for command
112                         for (it = parameters.begin(); it != parameters.end(); it++) { 
113                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
114                         }
115                         
116                         //initialize outputTypes
117                         vector<string> tempOutNames;
118                         outputTypes["fasta"] = tempOutNames;
119                         outputTypes["name"] = tempOutNames;
120                         outputTypes["group"] = tempOutNames;
121                         outputTypes["alignreport"] = tempOutNames;
122                         outputTypes["accnos"] = tempOutNames;
123                         outputTypes["qfile"] = tempOutNames;
124                         outputTypes["taxonomy"] = tempOutNames;
125                         
126                         //if the user changes the input directory command factory will send this info to us in the output parameter 
127                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
128                         if (inputDir == "not found"){   inputDir = "";          }
129                         else {
130                                 string path;
131                                 it = parameters.find("fasta");
132                                 //user has given a template file
133                                 if(it != parameters.end()){ 
134                                         path = m->hasPath(it->second);
135                                         //if the user has not given a path then, add inputdir. else leave path alone.
136                                         if (path == "") {       parameters["fasta"] = inputDir + it->second;            }
137                                 }
138                                 
139                                 it = parameters.find("group");
140                                 //user has given a template file
141                                 if(it != parameters.end()){ 
142                                         path = m->hasPath(it->second);
143                                         //if the user has not given a path then, add inputdir. else leave path alone.
144                                         if (path == "") {       parameters["group"] = inputDir + it->second;            }
145                                 }
146                                 
147                                 it = parameters.find("name");
148                                 //user has given a template file
149                                 if(it != parameters.end()){ 
150                                         path = m->hasPath(it->second);
151                                         //if the user has not given a path then, add inputdir. else leave path alone.
152                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
153                                 }
154                                 
155                                 it = parameters.find("alignreport");
156                                 //user has given a template file
157                                 if(it != parameters.end()){ 
158                                         path = m->hasPath(it->second);
159                                         //if the user has not given a path then, add inputdir. else leave path alone.
160                                         if (path == "") {       parameters["alignreport"] = inputDir + it->second;              }
161                                 }
162                                 
163                                 it = parameters.find("qfile");
164                                 //user has given a template file
165                                 if(it != parameters.end()){ 
166                                         path = m->hasPath(it->second);
167                                         //if the user has not given a path then, add inputdir. else leave path alone.
168                                         if (path == "") {       parameters["qfile"] = inputDir + it->second;            }
169                                 }
170                                 
171                                 it = parameters.find("taxonomy");
172                                 //user has given a template file
173                                 if(it != parameters.end()){ 
174                                         path = m->hasPath(it->second);
175                                         //if the user has not given a path then, add inputdir. else leave path alone.
176                                         if (path == "") {       parameters["taxonomy"] = inputDir + it->second;         }
177                                 }
178                         }
179
180                         //check for required parameters
181                         fastafile = validParameter.validFile(parameters, "fasta", true);
182                         if (fastafile == "not found") {                         
183                                 fastafile = m->getFastaFile(); 
184                                 if (fastafile != "") { m->mothurOut("Using " + fastafile + " as input file for the fasta parameter."); m->mothurOutEndLine(); }
185                                 else {  m->mothurOut("You have no current fastafile and the fasta parameter is required."); m->mothurOutEndLine(); abort = true; }
186                         }
187                         else if (fastafile == "not open") { abort = true; }
188                         else { m->setFastaFile(fastafile); }
189         
190                         groupfile = validParameter.validFile(parameters, "group", true);
191                         if (groupfile == "not open") { abort = true; }  
192                         else if (groupfile == "not found") { groupfile = ""; }
193                         else { m->setGroupFile(groupfile); }
194                         
195                         qualfile = validParameter.validFile(parameters, "qfile", true);
196                         if (qualfile == "not open") { abort = true; }   
197                         else if (qualfile == "not found") { qualfile = ""; }
198                         else { m->setQualFile(qualfile); }
199                         
200                         namefile = validParameter.validFile(parameters, "name", true);
201                         if (namefile == "not open") { namefile = ""; abort = true; }
202                         else if (namefile == "not found") { namefile = ""; }    
203                         else { m->setNameFile(namefile); }
204                         
205                         alignreport = validParameter.validFile(parameters, "alignreport", true);
206                         if (alignreport == "not open") { abort = true; }
207                         else if (alignreport == "not found") { alignreport = ""; }
208                         
209                         taxonomy = validParameter.validFile(parameters, "taxonomy", true);
210                         if (taxonomy == "not open") { abort = true; }
211                         else if (taxonomy == "not found") { taxonomy = ""; }    
212                         
213                         //if the user changes the output directory command factory will send this info to us in the output parameter 
214                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  
215                                 outputDir = ""; 
216                                 outputDir += m->hasPath(fastafile); //if user entered a file with a path then preserve it       
217                         }
218
219                         //check for optional parameter and set defaults
220                         // ...at some point should added some additional type checking...
221                         string temp;
222                         temp = validParameter.validFile(parameters, "start", false);            if (temp == "not found") { temp = "-1"; }
223                         m->mothurConvert(temp, startPos); 
224                 
225                         temp = validParameter.validFile(parameters, "end", false);                      if (temp == "not found") { temp = "-1"; }
226                         m->mothurConvert(temp, endPos);  
227
228                         temp = validParameter.validFile(parameters, "maxambig", false);         if (temp == "not found") { temp = "-1"; }
229                         m->mothurConvert(temp, maxAmbig);  
230
231                         temp = validParameter.validFile(parameters, "maxhomop", false);         if (temp == "not found") { temp = "-1"; }
232                         m->mothurConvert(temp, maxHomoP);  
233
234                         temp = validParameter.validFile(parameters, "minlength", false);        if (temp == "not found") { temp = "-1"; }
235                         m->mothurConvert(temp, minLength); 
236                         
237                         temp = validParameter.validFile(parameters, "maxlength", false);        if (temp == "not found") { temp = "-1"; }
238                         m->mothurConvert(temp, maxLength); 
239                         
240                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found"){       temp = m->getProcessors();      }
241                         m->setProcessors(temp);
242                         m->mothurConvert(temp, processors);
243                         
244                         temp = validParameter.validFile(parameters, "optimize", false); //optimizing trumps the optimized values original value
245                         if (temp == "not found"){       temp = "none";          }
246                         m->splitAtDash(temp, optimize);         
247                         
248                         //check for invalid optimize options
249                         set<string> validOptimizers;
250                         validOptimizers.insert("none"); validOptimizers.insert("start"); validOptimizers.insert("end"); validOptimizers.insert("maxambig"); validOptimizers.insert("maxhomop"); validOptimizers.insert("minlength"); validOptimizers.insert("maxlength");
251                         for (int i = 0; i < optimize.size(); i++) { 
252                                 if (validOptimizers.count(optimize[i]) == 0) { 
253                                         m->mothurOut(optimize[i] + " is not a valid optimizer. Valid options are start, end, maxambig, maxhomop, minlength and maxlength."); m->mothurOutEndLine();
254                                         optimize.erase(optimize.begin()+i);
255                                         i--;
256                                 }
257                         }
258                         
259                         if (optimize.size() == 1) { if (optimize[0] == "none") { optimize.clear(); } }
260                         
261                         temp = validParameter.validFile(parameters, "criteria", false); if (temp == "not found"){       temp = "90";                            }
262                         m->mothurConvert(temp, criteria); 
263                         
264                         if (namefile == "") {
265                                 vector<string> files; files.push_back(fastafile);
266                                 parser.getNameFile(files);
267                         }
268                 }
269
270         }
271         catch(exception& e) {
272                 m->errorOut(e, "ScreenSeqsCommand", "ScreenSeqsCommand");
273                 exit(1);
274         }
275 }
276 //***************************************************************************************************************
277
278 int ScreenSeqsCommand::execute(){
279         try{
280                 
281                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
282                 
283                 //if the user want to optimize we need to know the 90% mark
284                 vector<unsigned long long> positions;
285                 if (optimize.size() != 0) {  //get summary is paralellized so we need to divideFile, no need to do this step twice so I moved it here
286                         //use the namefile to optimize correctly
287                         if (namefile != "") { nameMap = m->readNames(namefile); }
288                         getSummary(positions); 
289                 } 
290                 else { 
291                         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
292                 positions = m->divideFile(fastafile, processors);
293                 for (int i = 0; i < (positions.size()-1); i++) { lines.push_back(linePair(positions[i], positions[(i+1)])); }
294                         #else 
295                 if(processors == 1){ lines.push_back(linePair(0, 1000));  }
296                 else {
297                     int numFastaSeqs = 0;
298                     positions = m->setFilePosFasta(fastafile, numFastaSeqs); 
299                 
300                     //figure out how many sequences you have to process
301                     int numSeqsPerProcessor = numFastaSeqs / processors;
302                     for (int i = 0; i < processors; i++) {
303                         int startIndex =  i * numSeqsPerProcessor;
304                         if(i == (processors - 1)){      numSeqsPerProcessor = numFastaSeqs - i * numSeqsPerProcessor;   }
305                         lines.push_back(linePair(positions[startIndex], numSeqsPerProcessor));
306                     }
307                 }
308                         #endif
309                 }
310                                         
311                 string goodSeqFile = outputDir + m->getRootName(m->getSimpleName(fastafile)) + "good" + m->getExtension(fastafile);
312                 string badAccnosFile =  outputDir + m->getRootName(m->getSimpleName(fastafile)) + "bad.accnos";
313                 
314                 int numFastaSeqs = 0;
315                 set<string> badSeqNames;
316                 int start = time(NULL);
317         
318 #ifdef USE_MPI  
319                         int pid, numSeqsPerProcessor; 
320                         int tag = 2001;
321                         vector<unsigned long long> MPIPos;
322                         
323                         MPI_Status status; 
324                         MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
325                         MPI_Comm_size(MPI_COMM_WORLD, &processors); 
326         
327                         MPI_File inMPI;
328                         MPI_File outMPIGood;
329                         MPI_File outMPIBadAccnos;
330                         
331                         int outMode=MPI_MODE_CREATE|MPI_MODE_WRONLY; 
332                         int inMode=MPI_MODE_RDONLY; 
333                         
334                         char outGoodFilename[1024];
335                         strcpy(outGoodFilename, goodSeqFile.c_str());
336
337                         char outBadAccnosFilename[1024];
338                         strcpy(outBadAccnosFilename, badAccnosFile.c_str());
339
340                         char inFileName[1024];
341                         strcpy(inFileName, fastafile.c_str());
342                         
343                         MPI_File_open(MPI_COMM_WORLD, inFileName, inMode, MPI_INFO_NULL, &inMPI);  //comm, filename, mode, info, filepointer
344                         MPI_File_open(MPI_COMM_WORLD, outGoodFilename, outMode, MPI_INFO_NULL, &outMPIGood);
345                         MPI_File_open(MPI_COMM_WORLD, outBadAccnosFilename, outMode, MPI_INFO_NULL, &outMPIBadAccnos);
346                         
347                         if (m->control_pressed) { MPI_File_close(&inMPI);  MPI_File_close(&outMPIGood); MPI_File_close(&outMPIBadAccnos); return 0; }
348                         
349                         if (pid == 0) { //you are the root process 
350                                 
351                                 MPIPos = m->setFilePosFasta(fastafile, numFastaSeqs); //fills MPIPos, returns numSeqs
352                                 
353                                 //send file positions to all processes
354                                 for(int i = 1; i < processors; i++) { 
355                                         MPI_Send(&numFastaSeqs, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
356                                         MPI_Send(&MPIPos[0], (numFastaSeqs+1), MPI_LONG, i, tag, MPI_COMM_WORLD);
357                                 }
358                                 
359                                 //figure out how many sequences you have to align
360                                 numSeqsPerProcessor = numFastaSeqs / processors;
361                                 int startIndex =  pid * numSeqsPerProcessor;
362                                 if(pid == (processors - 1)){    numSeqsPerProcessor = numFastaSeqs - pid * numSeqsPerProcessor;         }
363
364                                 //align your part
365                                 driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPIGood, outMPIBadAccnos, MPIPos, badSeqNames);
366
367                                 if (m->control_pressed) { MPI_File_close(&inMPI);  MPI_File_close(&outMPIGood);  MPI_File_close(&outMPIBadAccnos);  return 0; }
368
369                                 for (int i = 1; i < processors; i++) {
370                                         //get bad lists
371                                         int badSize;
372                                         MPI_Recv(&badSize, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
373                                 }
374                         }else{ //you are a child process
375                                 MPI_Recv(&numFastaSeqs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
376                                 MPIPos.resize(numFastaSeqs+1);
377                                 MPI_Recv(&MPIPos[0], (numFastaSeqs+1), MPI_LONG, 0, tag, MPI_COMM_WORLD, &status);
378
379                                 //figure out how many sequences you have to align
380                                 numSeqsPerProcessor = numFastaSeqs / processors;
381                                 int startIndex =  pid * numSeqsPerProcessor;
382                                 if(pid == (processors - 1)){    numSeqsPerProcessor = numFastaSeqs - pid * numSeqsPerProcessor;         }
383
384                                 //align your part
385                                 driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPIGood, outMPIBadAccnos, MPIPos, badSeqNames);
386
387                                 if (m->control_pressed) { MPI_File_close(&inMPI);  MPI_File_close(&outMPIGood);  MPI_File_close(&outMPIBadAccnos); return 0; }
388                                 
389                                 //send bad list 
390                                 int badSize = badSeqNames.size();
391                                 MPI_Send(&badSize, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
392                         }
393                         
394                         //close files 
395                         MPI_File_close(&inMPI);
396                         MPI_File_close(&outMPIGood);
397                         MPI_File_close(&outMPIBadAccnos);
398                         MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case
399                                         
400 #else
401                                                 
402         //#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
403                         if(processors == 1){
404                                 numFastaSeqs = driver(lines[0], goodSeqFile, badAccnosFile, fastafile, badSeqNames);            
405                         }else{
406                                 numFastaSeqs = createProcesses(goodSeqFile, badAccnosFile, fastafile, badSeqNames); 
407             }
408         //#else
409         //              numFastaSeqs = driver(lines[0], goodSeqFile, badAccnosFile, fastafile, badSeqNames);            
410         //#endif
411             if (m->control_pressed) { m->mothurRemove(goodSeqFile); return 0; }
412 #endif          
413
414                 #ifdef USE_MPI
415                         MPI_Comm_rank(MPI_COMM_WORLD, &pid); 
416                                         
417                         if (pid == 0) { //only one process should fix files
418                         
419                                 //read accnos file with all names in it, process 0 just has its names
420                                 MPI_File inMPIAccnos;
421                                 MPI_Offset size;
422                         
423                                 char inFileName[1024];
424                                 strcpy(inFileName, badAccnosFile.c_str());
425                         
426                                 MPI_File_open(MPI_COMM_SELF, inFileName, inMode, MPI_INFO_NULL, &inMPIAccnos);  //comm, filename, mode, info, filepointer
427                                 MPI_File_get_size(inMPIAccnos, &size);
428                         
429                                 char* buffer = new char[size];
430                                 MPI_File_read(inMPIAccnos, buffer, size, MPI_CHAR, &status);
431                         
432                                 string tempBuf = buffer;
433                                 if (tempBuf.length() > size) { tempBuf = tempBuf.substr(0, size);  }
434                                 istringstream iss (tempBuf,istringstream::in);
435
436                                 delete buffer;
437                                 MPI_File_close(&inMPIAccnos);
438                                 
439                                 badSeqNames.clear();
440                                 string tempName;
441                                 while (!iss.eof()) {
442                                         iss >> tempName; m->gobble(iss);
443                                         badSeqNames.insert(tempName);
444                                 }
445                 #endif
446                                                                                                                                                                         
447                 if(namefile != "" && groupfile != "")   {       
448                         screenNameGroupFile(badSeqNames);       
449                         if (m->control_pressed) {  m->mothurRemove(goodSeqFile); return 0; }
450                 }else if(namefile != "")        {       
451                         screenNameGroupFile(badSeqNames);
452                         if (m->control_pressed) {  m->mothurRemove(goodSeqFile);  return 0; }   
453                 }else if(groupfile != "")                               {       screenGroupFile(badSeqNames);           }       // this screens just the group
454                 
455                 if (m->control_pressed) { m->mothurRemove(goodSeqFile);  return 0; }
456
457                 if(alignreport != "")                                   {       screenAlignReport(badSeqNames);         }
458                 if(qualfile != "")                                              {       screenQual(badSeqNames);                        }
459                 if(taxonomy != "")                                              {       screenTaxonomy(badSeqNames);            }
460                 
461                 if (m->control_pressed) { m->mothurRemove(goodSeqFile);  return 0; }
462                 
463                 #ifdef USE_MPI
464                         }
465                 #endif
466
467                 m->mothurOutEndLine();
468                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
469                 m->mothurOut(goodSeqFile); m->mothurOutEndLine();       outputTypes["fasta"].push_back(goodSeqFile);
470                 m->mothurOut(badAccnosFile); m->mothurOutEndLine();      outputTypes["accnos"].push_back(badAccnosFile);
471                 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
472                 m->mothurOutEndLine();
473                 m->mothurOutEndLine();
474                 
475                 //set fasta file as new current fastafile
476                 string current = "";
477                 itTypes = outputTypes.find("fasta");
478                 if (itTypes != outputTypes.end()) {
479                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFastaFile(current); }
480                 }
481                 
482                 itTypes = outputTypes.find("name");
483                 if (itTypes != outputTypes.end()) {
484                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setNameFile(current); }
485                 }
486                 
487                 itTypes = outputTypes.find("group");
488                 if (itTypes != outputTypes.end()) {
489                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setGroupFile(current); }
490                 }
491                 
492                 itTypes = outputTypes.find("qfile");
493                 if (itTypes != outputTypes.end()) {
494                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setQualFile(current); }
495                 }
496                 
497                 itTypes = outputTypes.find("taxonomy");
498                 if (itTypes != outputTypes.end()) {
499                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setTaxonomyFile(current); }
500                 }
501
502                 m->mothurOut("It took " + toString(time(NULL) - start) + " secs to screen " + toString(numFastaSeqs) + " sequences.");
503                 m->mothurOutEndLine();
504
505                 return 0;
506         }
507         catch(exception& e) {
508                 m->errorOut(e, "ScreenSeqsCommand", "execute");
509                 exit(1);
510         }
511 }
512
513 //***************************************************************************************************************
514
515 int ScreenSeqsCommand::screenNameGroupFile(set<string> badSeqNames){
516         try {
517                 ifstream inputNames;
518                 m->openInputFile(namefile, inputNames);
519                 set<string> badSeqGroups;
520                 string seqName, seqList, group;
521                 set<string>::iterator it;
522
523                 string goodNameFile = outputDir + m->getRootName(m->getSimpleName(namefile)) + "good" + m->getExtension(namefile);
524                 outputNames.push_back(goodNameFile);  outputTypes["name"].push_back(goodNameFile);
525                 
526                 ofstream goodNameOut;   m->openOutputFile(goodNameFile, goodNameOut);
527                 
528                 while(!inputNames.eof()){
529                         if (m->control_pressed) { goodNameOut.close();  inputNames.close(); m->mothurRemove(goodNameFile);  return 0; }
530
531                         inputNames >> seqName >> seqList;
532                         it = badSeqNames.find(seqName);
533                                 
534                         if(it != badSeqNames.end()){
535                                 badSeqNames.erase(it);
536                                 
537                                 if(namefile != ""){
538                                         int start = 0;
539                                         for(int i=0;i<seqList.length();i++){
540                                                 if(seqList[i] == ','){
541                                                         badSeqGroups.insert(seqList.substr(start,i-start));
542                                                         start = i+1;
543                                                 }                                       
544                                         }
545                                         badSeqGroups.insert(seqList.substr(start,seqList.length()-start));
546                                 }
547                         }
548                         else{
549                                 goodNameOut << seqName << '\t' << seqList << endl;
550                         }
551                         m->gobble(inputNames);
552                 }
553                 inputNames.close();
554                 goodNameOut.close();
555         
556                 //we were unable to remove some of the bad sequences
557                 if (badSeqNames.size() != 0) {
558                         for (it = badSeqNames.begin(); it != badSeqNames.end(); it++) {  
559                                 m->mothurOut("Your namefile does not include the sequence " + *it + " please correct."); 
560                                 m->mothurOutEndLine();
561                         }
562                 }
563
564                 if(groupfile != ""){
565                         
566                         ifstream inputGroups;
567                         m->openInputFile(groupfile, inputGroups);
568
569                         string goodGroupFile = outputDir + m->getRootName(m->getSimpleName(groupfile)) + "good" + m->getExtension(groupfile);
570                         outputNames.push_back(goodGroupFile);   outputTypes["group"].push_back(goodGroupFile);
571                         
572                         ofstream goodGroupOut;  m->openOutputFile(goodGroupFile, goodGroupOut);
573                         
574                         while(!inputGroups.eof()){
575                                 if (m->control_pressed) { goodGroupOut.close(); inputGroups.close(); m->mothurRemove(goodNameFile);  m->mothurRemove(goodGroupFile); return 0; }
576
577                                 inputGroups >> seqName >> group;
578                                 
579                                 it = badSeqGroups.find(seqName);
580                                 
581                                 if(it != badSeqGroups.end()){
582                                         badSeqGroups.erase(it);
583                                 }
584                                 else{
585                                         goodGroupOut << seqName << '\t' << group << endl;
586                                 }
587                                 m->gobble(inputGroups);
588                         }
589                         inputGroups.close();
590                         goodGroupOut.close();
591                         
592                         //we were unable to remove some of the bad sequences
593                         if (badSeqGroups.size() != 0) {
594                                 for (it = badSeqGroups.begin(); it != badSeqGroups.end(); it++) {  
595                                         m->mothurOut("Your groupfile does not include the sequence " + *it + " please correct."); 
596                                         m->mothurOutEndLine();
597                                 }
598                         }
599                 }
600                 
601                 
602                 return 0;
603         
604         }
605         catch(exception& e) {
606                 m->errorOut(e, "ScreenSeqsCommand", "screenNameGroupFile");
607                 exit(1);
608         }
609 }
610 //***************************************************************************************************************
611 int ScreenSeqsCommand::getSummary(vector<unsigned long long>& positions){
612         try {
613                 
614                 vector<int> startPosition;
615                 vector<int> endPosition;
616                 vector<int> seqLength;
617                 vector<int> ambigBases;
618                 vector<int> longHomoPolymer;
619                 
620         vector<unsigned long long> positions;
621 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
622                 positions = m->divideFile(fastafile, processors);
623                 for (int i = 0; i < (positions.size()-1); i++) { lines.push_back(linePair(positions[i], positions[(i+1)])); }   
624 #else
625                 if(processors == 1){ lines.push_back(linePair(0, 1000));  }
626         else {
627             int numFastaSeqs = 0;
628             positions = m->setFilePosFasta(fastafile, numFastaSeqs); 
629             
630             //figure out how many sequences you have to process
631             int numSeqsPerProcessor = numFastaSeqs / processors;
632             for (int i = 0; i < processors; i++) {
633                 int startIndex =  i * numSeqsPerProcessor;
634                 if(i == (processors - 1)){      numSeqsPerProcessor = numFastaSeqs - i * numSeqsPerProcessor;   }
635                 lines.push_back(linePair(positions[startIndex], numSeqsPerProcessor));
636             }
637         }
638 #endif
639                 
640 #ifdef USE_MPI
641                 int pid;
642                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); 
643                 
644                 if (pid == 0) { 
645                         driverCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, fastafile, lines[0]);
646 #else
647                 int numSeqs = 0;
648                 //#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
649                         if(processors == 1){
650                                 numSeqs = driverCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, fastafile, lines[0]);
651                         }else{
652                                 numSeqs = createProcessesCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, fastafile); 
653                         }
654                                 
655                         if (m->control_pressed) {  return 0; }
656                 //#else
657                 //      numSeqs = driverCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, fastafile, lines[0]);
658                 //      if (m->control_pressed) {  return 0; }
659                 //#endif
660 #endif
661                 sort(startPosition.begin(), startPosition.end());
662                 sort(endPosition.begin(), endPosition.end());
663                 sort(seqLength.begin(), seqLength.end());
664                 sort(ambigBases.begin(), ambigBases.end());
665                 sort(longHomoPolymer.begin(), longHomoPolymer.end());
666                 
667                 //numSeqs is the number of unique seqs, startPosition.size() is the total number of seqs, we want to optimize using all seqs
668                 int criteriaPercentile  = int(startPosition.size() * (criteria / (float) 100));
669                 
670                 for (int i = 0; i < optimize.size(); i++) {
671                         if (optimize[i] == "start") { startPos = startPosition[criteriaPercentile]; m->mothurOut("Optimizing start to " + toString(startPos) + "."); m->mothurOutEndLine(); }
672                         else if (optimize[i] == "end") { int endcriteriaPercentile = int(endPosition.size() * ((100 - criteria) / (float) 100));  endPos = endPosition[endcriteriaPercentile]; m->mothurOut("Optimizing end to " + toString(endPos) + "."); m->mothurOutEndLine();}
673                         else if (optimize[i] == "maxambig") { maxAmbig = ambigBases[criteriaPercentile]; m->mothurOut("Optimizing maxambig to " + toString(maxAmbig) + "."); m->mothurOutEndLine(); }
674                         else if (optimize[i] == "maxhomop") { maxHomoP = longHomoPolymer[criteriaPercentile]; m->mothurOut("Optimizing maxhomop to " + toString(maxHomoP) + "."); m->mothurOutEndLine(); }
675                         else if (optimize[i] == "minlength") { int mincriteriaPercentile = int(seqLength.size() * ((100 - criteria) / (float) 100)); minLength = seqLength[mincriteriaPercentile]; m->mothurOut("Optimizing minlength to " + toString(minLength) + "."); m->mothurOutEndLine(); }
676                         else if (optimize[i] == "maxlength") { maxLength = seqLength[criteriaPercentile]; m->mothurOut("Optimizing maxlength to " + toString(maxLength) + "."); m->mothurOutEndLine(); }
677                 }
678
679 #ifdef USE_MPI
680                 }
681                 
682                 MPI_Status status; 
683                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); 
684                 MPI_Comm_size(MPI_COMM_WORLD, &processors); 
685                         
686                 if (pid == 0) { 
687                         //send file positions to all processes
688                         for(int i = 1; i < processors; i++) { 
689                                 MPI_Send(&startPos, 1, MPI_INT, i, 2001, MPI_COMM_WORLD);
690                                 MPI_Send(&endPos, 1, MPI_INT, i, 2001, MPI_COMM_WORLD);
691                                 MPI_Send(&maxAmbig, 1, MPI_INT, i, 2001, MPI_COMM_WORLD);
692                                 MPI_Send(&maxHomoP, 1, MPI_INT, i, 2001, MPI_COMM_WORLD);
693                                 MPI_Send(&minLength, 1, MPI_INT, i, 2001, MPI_COMM_WORLD);
694                                 MPI_Send(&maxLength, 1, MPI_INT, i, 2001, MPI_COMM_WORLD);
695                         }
696                 }else {
697                         MPI_Recv(&startPos, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status);
698                         MPI_Recv(&endPos, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status);
699                         MPI_Recv(&maxAmbig, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status);
700                         MPI_Recv(&maxHomoP, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status);
701                         MPI_Recv(&minLength, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status);
702                         MPI_Recv(&maxLength, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status);
703                 }
704                 MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case
705 #endif
706                 return 0;
707         }
708         catch(exception& e) {
709                 m->errorOut(e, "ScreenSeqsCommand", "getSummary");
710                 exit(1);
711         }
712 }
713 /**************************************************************************************/
714 int ScreenSeqsCommand::driverCreateSummary(vector<int>& startPosition, vector<int>& endPosition, vector<int>& seqLength, vector<int>& ambigBases, vector<int>& longHomoPolymer, string filename, linePair filePos) {    
715         try {
716                 
717                 ifstream in;
718                 m->openInputFile(filename, in);
719                                 
720                 in.seekg(filePos.start);
721
722                 bool done = false;
723                 int count = 0;
724         
725                 while (!done) {
726                                 
727                         if (m->control_pressed) { in.close(); return 1; }
728                                         
729                         Sequence current(in); m->gobble(in);
730         
731                         if (current.getName() != "") {
732                                 int num = 1;
733                                 if (namefile != "") {
734                                         //make sure this sequence is in the namefile, else error 
735                                         map<string, int>::iterator it = nameMap.find(current.getName());
736                                         
737                                         if (it == nameMap.end()) { m->mothurOut("[ERROR]: " + current.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); m->control_pressed = true; }
738                                         else { num = it->second; }
739                                 }
740                                 
741                                 //for each sequence this sequence represents
742                                 for (int i = 0; i < num; i++) {
743                                         startPosition.push_back(current.getStartPos());
744                                         endPosition.push_back(current.getEndPos());
745                                         seqLength.push_back(current.getNumBases());
746                                         ambigBases.push_back(current.getAmbigBases());
747                                         longHomoPolymer.push_back(current.getLongHomoPolymer());
748                                 }
749                                 
750                                 count++;
751                         }
752                         //if((count) % 100 == 0){       m->mothurOut("Optimizing sequence: " + toString(count)); m->mothurOutEndLine();         }
753                         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
754                                 unsigned long long pos = in.tellg();
755                                 if ((pos == -1) || (pos >= filePos.end)) { break; }
756                         #else
757                                 if (in.eof()) { break; }
758                         #endif
759                         
760                 }
761                 
762                 in.close();
763                 
764                 return count;
765         }
766         catch(exception& e) {
767                 m->errorOut(e, "ScreenSeqsCommand", "driverCreateSummary");
768                 exit(1);
769         }
770 }
771 /**************************************************************************************************/
772 int ScreenSeqsCommand::createProcessesCreateSummary(vector<int>& startPosition, vector<int>& endPosition, vector<int>& seqLength, vector<int>& ambigBases, vector<int>& longHomoPolymer, string filename) {
773         try {
774         
775         int process = 1;
776                 int num = 0;
777                 vector<int> processIDS;
778
779 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
780                                 
781                 //loop through and create all the processes you want
782                 while (process != processors) {
783                         int pid = fork();
784                         
785                         if (pid > 0) {
786                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
787                                 process++;
788                         }else if (pid == 0){
789                                 num = driverCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, fastafile, lines[process]);
790                                 
791                                 //pass numSeqs to parent
792                                 ofstream out;
793                                 string tempFile = fastafile + toString(getpid()) + ".num.temp";
794                                 m->openOutputFile(tempFile, out);
795                                 
796                                 out << num << endl;
797                                 out << startPosition.size() << endl;
798                                 for (int k = 0; k < startPosition.size(); k++)          {               out << startPosition[k] << '\t'; }  out << endl;
799                                 for (int k = 0; k < endPosition.size(); k++)            {               out << endPosition[k] << '\t'; }  out << endl;
800                                 for (int k = 0; k < seqLength.size(); k++)                      {               out << seqLength[k] << '\t'; }  out << endl;
801                                 for (int k = 0; k < ambigBases.size(); k++)                     {               out << ambigBases[k] << '\t'; }  out << endl;
802                                 for (int k = 0; k < longHomoPolymer.size(); k++)        {               out << longHomoPolymer[k] << '\t'; }  out << endl;
803                                 
804                                 out.close();
805                                 
806                                 exit(0);
807                         }else { 
808                                 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine(); 
809                                 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
810                                 exit(0);
811                         }
812                 }
813                 
814                 num = driverCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, fastafile, lines[0]);
815                 
816                 //force parent to wait until all the processes are done
817                 for (int i=0;i<processIDS.size();i++) { 
818                         int temp = processIDS[i];
819                         wait(&temp);
820                 }
821                 
822                 //parent reads in and combine Filter info
823                 for (int i = 0; i < processIDS.size(); i++) {
824                         string tempFilename = fastafile + toString(processIDS[i]) + ".num.temp";
825                         ifstream in;
826                         m->openInputFile(tempFilename, in);
827                         
828                         int temp, tempNum;
829                         in >> tempNum; m->gobble(in); num += tempNum;
830                         in >> tempNum; m->gobble(in);
831                         for (int k = 0; k < tempNum; k++)                       {               in >> temp; startPosition.push_back(temp);              }               m->gobble(in);
832                         for (int k = 0; k < tempNum; k++)                       {               in >> temp; endPosition.push_back(temp);                }               m->gobble(in);
833                         for (int k = 0; k < tempNum; k++)                       {               in >> temp; seqLength.push_back(temp);                  }               m->gobble(in);
834                         for (int k = 0; k < tempNum; k++)                       {               in >> temp; ambigBases.push_back(temp);                 }               m->gobble(in);
835                         for (int k = 0; k < tempNum; k++)                       {               in >> temp; longHomoPolymer.push_back(temp);    }               m->gobble(in);
836                                 
837                         in.close();
838                         m->mothurRemove(tempFilename);
839                 }
840                 
841                 
842 #else 
843         //////////////////////////////////////////////////////////////////////////////////////////////////////
844                 //Windows version shared memory, so be careful when passing variables through the seqSumData struct. 
845                 //Above fork() will clone, so memory is separate, but that's not the case with windows, 
846                 //Taking advantage of shared memory to allow both threads to add info to vectors.
847                 //////////////////////////////////////////////////////////////////////////////////////////////////////
848                 
849                 vector<sumData*> pDataArray; 
850                 DWORD   dwThreadIdArray[processors-1];
851                 HANDLE  hThreadArray[processors-1]; 
852                 
853                 //Create processor worker threads.
854                 for( int i=0; i<processors-1; i++ ){
855             
856                         // Allocate memory for thread data.
857                         sumData* tempSum = new sumData(filename, m, lines[i].start, lines[i].end, namefile, nameMap);
858                         pDataArray.push_back(tempSum);
859                         
860                         //MySeqSumThreadFunction is in header. It must be global or static to work with the threads.
861                         //default security attributes, thread function name, argument to thread function, use default creation flags, returns the thread identifier
862                         hThreadArray[i] = CreateThread(NULL, 0, MySumThreadFunction, pDataArray[i], 0, &dwThreadIdArray[i]);   
863                 }
864                 
865         //do your part
866                 num = driverCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, fastafile, lines[processors-1]);
867          
868                 //Wait until all threads have terminated.
869                 WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
870                 
871                 //Close all thread handles and free memory allocations.
872                 for(int i=0; i < pDataArray.size(); i++){
873                         num += pDataArray[i]->count;
874             for (int k = 0; k < pDataArray[i]->startPosition.size(); k++) {     startPosition.push_back(pDataArray[i]->startPosition[k]);       }
875                         for (int k = 0; k < pDataArray[i]->endPosition.size(); k++) {   endPosition.push_back(pDataArray[i]->endPosition[k]);       }
876             for (int k = 0; k < pDataArray[i]->seqLength.size(); k++) { seqLength.push_back(pDataArray[i]->seqLength[k]);       }
877             for (int k = 0; k < pDataArray[i]->ambigBases.size(); k++) {        ambigBases.push_back(pDataArray[i]->ambigBases[k]);       }
878             for (int k = 0; k < pDataArray[i]->longHomoPolymer.size(); k++) {   longHomoPolymer.push_back(pDataArray[i]->longHomoPolymer[k]);       }
879                         CloseHandle(hThreadArray[i]);
880                         delete pDataArray[i];
881                 }
882
883 #endif          
884         return num;
885         }
886         catch(exception& e) {
887                 m->errorOut(e, "ScreenSeqsCommand", "createProcessesCreateSummary");
888                 exit(1);
889         }
890 }
891
892 //***************************************************************************************************************
893
894 int ScreenSeqsCommand::screenGroupFile(set<string> badSeqNames){
895         try {
896                 ifstream inputGroups;
897                 m->openInputFile(groupfile, inputGroups);
898                 string seqName, group;
899                 set<string>::iterator it;
900                 
901                 string goodGroupFile = outputDir + m->getRootName(m->getSimpleName(groupfile)) + "good" + m->getExtension(groupfile);
902                 outputNames.push_back(goodGroupFile);  outputTypes["group"].push_back(goodGroupFile);
903                 ofstream goodGroupOut;  m->openOutputFile(goodGroupFile, goodGroupOut);
904                 
905                 while(!inputGroups.eof()){
906                         if (m->control_pressed) { goodGroupOut.close(); inputGroups.close(); m->mothurRemove(goodGroupFile); return 0; }
907
908                         inputGroups >> seqName >> group;
909                         it = badSeqNames.find(seqName);
910                         
911                         if(it != badSeqNames.end()){
912                                 badSeqNames.erase(it);
913                         }
914                         else{
915                                 goodGroupOut << seqName << '\t' << group << endl;
916                         }
917                         m->gobble(inputGroups);
918                 }
919                 
920                 if (m->control_pressed) { goodGroupOut.close();  inputGroups.close(); m->mothurRemove(goodGroupFile);  return 0; }
921
922                 //we were unable to remove some of the bad sequences
923                 if (badSeqNames.size() != 0) {
924                         for (it = badSeqNames.begin(); it != badSeqNames.end(); it++) {  
925                                 m->mothurOut("Your groupfile does not include the sequence " + *it + " please correct."); 
926                                 m->mothurOutEndLine();
927                         }
928                 }
929                 
930                 inputGroups.close();
931                 goodGroupOut.close();
932                 
933                 if (m->control_pressed) { m->mothurRemove(goodGroupFile);   }
934                 
935                 return 0;
936         
937         }
938         catch(exception& e) {
939                 m->errorOut(e, "ScreenSeqsCommand", "screenGroupFile");
940                 exit(1);
941         }
942 }
943
944 //***************************************************************************************************************
945
946 int ScreenSeqsCommand::screenAlignReport(set<string> badSeqNames){
947         try {
948                 ifstream inputAlignReport;
949                 m->openInputFile(alignreport, inputAlignReport);
950                 string seqName, group;
951                 set<string>::iterator it;
952                 
953                 string goodAlignReportFile = outputDir + m->getRootName(m->getSimpleName(alignreport)) + "good" + m->getExtension(alignreport);
954                 outputNames.push_back(goodAlignReportFile);  outputTypes["alignreport"].push_back(goodAlignReportFile);
955                 ofstream goodAlignReportOut;    m->openOutputFile(goodAlignReportFile, goodAlignReportOut);
956
957                 while (!inputAlignReport.eof()) {               //      need to copy header
958                         char c = inputAlignReport.get();
959                         goodAlignReportOut << c;
960                         if (c == 10 || c == 13){        break;  }       
961                 }
962
963                 while(!inputAlignReport.eof()){
964                         if (m->control_pressed) { goodAlignReportOut.close(); inputAlignReport.close(); m->mothurRemove(goodAlignReportFile); return 0; }
965
966                         inputAlignReport >> seqName;
967                         it = badSeqNames.find(seqName);
968                         string line;            
969                         while (!inputAlignReport.eof()) {               //      need to copy header
970                                 char c = inputAlignReport.get();
971                                 line += c;
972                                 if (c == 10 || c == 13){        break;  }       
973                         }
974                         
975                         if(it != badSeqNames.end()){
976                                 badSeqNames.erase(it);
977                         }
978                         else{
979                                 goodAlignReportOut << seqName << '\t' << line;
980                         }
981                         m->gobble(inputAlignReport);
982                 }
983                 
984                 if (m->control_pressed) { goodAlignReportOut.close();  inputAlignReport.close(); m->mothurRemove(goodAlignReportFile);  return 0; }
985
986                 //we were unable to remove some of the bad sequences
987                 if (badSeqNames.size() != 0) {
988                         for (it = badSeqNames.begin(); it != badSeqNames.end(); it++) {  
989                                 m->mothurOut("Your alignreport file does not include the sequence " + *it + " please correct."); 
990                                 m->mothurOutEndLine();
991                         }
992                 }
993
994                 inputAlignReport.close();
995                 goodAlignReportOut.close();
996                                 
997                 if (m->control_pressed) {  m->mothurRemove(goodAlignReportFile);  return 0; }
998                 
999                 return 0;
1000         
1001         }
1002         catch(exception& e) {
1003                 m->errorOut(e, "ScreenSeqsCommand", "screenAlignReport");
1004                 exit(1);
1005         }
1006         
1007 }
1008 //***************************************************************************************************************
1009
1010 int ScreenSeqsCommand::screenTaxonomy(set<string> badSeqNames){
1011         try {
1012                 ifstream input;
1013                 m->openInputFile(taxonomy, input);
1014                 string seqName, tax;
1015                 set<string>::iterator it;
1016                 
1017                 string goodTaxFile = outputDir + m->getRootName(m->getSimpleName(taxonomy)) + "good" + m->getExtension(taxonomy);
1018                 outputNames.push_back(goodTaxFile);  outputTypes["taxonomy"].push_back(goodTaxFile);
1019                 ofstream goodTaxOut;    m->openOutputFile(goodTaxFile, goodTaxOut);
1020                                 
1021                 while(!input.eof()){
1022                         if (m->control_pressed) { goodTaxOut.close(); input.close(); m->mothurRemove(goodTaxFile); return 0; }
1023                         
1024                         input >> seqName >> tax;
1025                         it = badSeqNames.find(seqName);
1026                         
1027                         if(it != badSeqNames.end()){ badSeqNames.erase(it); }
1028                         else{
1029                                 goodTaxOut << seqName << '\t' << tax << endl;
1030                         }
1031                         m->gobble(input);
1032                 }
1033                 
1034                 if (m->control_pressed) { goodTaxOut.close(); input.close(); m->mothurRemove(goodTaxFile); return 0; }
1035                 
1036                 //we were unable to remove some of the bad sequences
1037                 if (badSeqNames.size() != 0) {
1038                         for (it = badSeqNames.begin(); it != badSeqNames.end(); it++) {  
1039                                 m->mothurOut("Your taxonomy file does not include the sequence " + *it + " please correct."); 
1040                                 m->mothurOutEndLine();
1041                         }
1042                 }
1043                 
1044                 input.close();
1045                 goodTaxOut.close();
1046                 
1047                 if (m->control_pressed) {  m->mothurRemove(goodTaxFile);  return 0; }
1048                 
1049                 return 0;
1050                 
1051         }
1052         catch(exception& e) {
1053                 m->errorOut(e, "ScreenSeqsCommand", "screenTaxonomy");
1054                 exit(1);
1055         }
1056         
1057 }
1058 //***************************************************************************************************************
1059
1060 int ScreenSeqsCommand::screenQual(set<string> badSeqNames){
1061         try {
1062                 ifstream in;
1063                 m->openInputFile(qualfile, in);
1064                 set<string>::iterator it;
1065                 
1066                 string goodQualFile = outputDir + m->getRootName(m->getSimpleName(qualfile)) + "good" + m->getExtension(qualfile);
1067                 outputNames.push_back(goodQualFile);  outputTypes["qfile"].push_back(goodQualFile);
1068                 ofstream goodQual;      m->openOutputFile(goodQualFile, goodQual);
1069                 
1070                 while(!in.eof()){       
1071                         
1072                         if (m->control_pressed) { goodQual.close(); in.close(); m->mothurRemove(goodQualFile); return 0; }
1073
1074                         string saveName = "";
1075                         string name = "";
1076                         string scores = "";
1077                         
1078                         in >> name; 
1079                         
1080                         if (name.length() != 0) { 
1081                                 saveName = name.substr(1);
1082                                 while (!in.eof())       {       
1083                                         char c = in.get(); 
1084                                         if (c == 10 || c == 13){        break;  }
1085                                         else { name += c; }     
1086                                 } 
1087                                 m->gobble(in);
1088                         }
1089                         
1090                         while(in){
1091                                 char letter= in.get();
1092                                 if(letter == '>'){      in.putback(letter);     break;  }
1093                                 else{ scores += letter; }
1094                         }
1095                         
1096                         m->gobble(in);
1097                         
1098                         it = badSeqNames.find(saveName);
1099                         
1100                         if(it != badSeqNames.end()){
1101                                 badSeqNames.erase(it);
1102                         }else{                          
1103                                 goodQual << name << endl << scores;
1104                         }
1105                         
1106                         m->gobble(in);
1107                 }
1108                 
1109                 in.close();
1110                 goodQual.close();
1111                 
1112                 //we were unable to remove some of the bad sequences
1113                 if (badSeqNames.size() != 0) {
1114                         for (it = badSeqNames.begin(); it != badSeqNames.end(); it++) {  
1115                                 m->mothurOut("Your qual file does not include the sequence " + *it + " please correct."); 
1116                                 m->mothurOutEndLine();
1117                         }
1118                 }
1119                 
1120                 if (m->control_pressed) {  m->mothurRemove(goodQualFile);  return 0; }
1121                 
1122                 return 0;
1123                 
1124         }
1125         catch(exception& e) {
1126                 m->errorOut(e, "ScreenSeqsCommand", "screenQual");
1127                 exit(1);
1128         }
1129         
1130 }
1131 //**********************************************************************************************************************
1132
1133 int ScreenSeqsCommand::driver(linePair filePos, string goodFName, string badAccnosFName, string filename, set<string>& badSeqNames){
1134         try {
1135                 ofstream goodFile;
1136                 m->openOutputFile(goodFName, goodFile);
1137                 
1138                 ofstream badAccnosFile;
1139                 m->openOutputFile(badAccnosFName, badAccnosFile);
1140                 
1141                 ifstream inFASTA;
1142                 m->openInputFile(filename, inFASTA);
1143
1144                 inFASTA.seekg(filePos.start);
1145
1146                 bool done = false;
1147                 int count = 0;
1148         
1149                 while (!done) {
1150                 
1151                         if (m->control_pressed) {  return 0; }
1152                         
1153                         Sequence currSeq(inFASTA); m->gobble(inFASTA);
1154                         if (currSeq.getName() != "") {
1155                                 bool goodSeq = 1;               //      innocent until proven guilty
1156                                 if(goodSeq == 1 && startPos != -1 && startPos < currSeq.getStartPos())                  {       goodSeq = 0;    }
1157                                 if(goodSeq == 1 && endPos != -1 && endPos > currSeq.getEndPos())                                {       goodSeq = 0;    }
1158                                 if(goodSeq == 1 && maxAmbig != -1 && maxAmbig < currSeq.getAmbigBases())                {       goodSeq = 0;    }
1159                                 if(goodSeq == 1 && maxHomoP != -1 && maxHomoP < currSeq.getLongHomoPolymer())   {       goodSeq = 0;    }
1160                                 if(goodSeq == 1 && minLength != -1 && minLength > currSeq.getNumBases())                {       goodSeq = 0;    }
1161                                 if(goodSeq == 1 && maxLength != -1 && maxLength < currSeq.getNumBases())                {       goodSeq = 0;    }
1162                                 
1163                                 if(goodSeq == 1){
1164                                         currSeq.printSequence(goodFile);        
1165                                 }
1166                                 else{
1167                                         badAccnosFile << currSeq.getName() << endl;
1168                                         badSeqNames.insert(currSeq.getName());
1169                                 }
1170                         count++;
1171                         }
1172                         
1173                         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
1174                                 unsigned long long pos = inFASTA.tellg();
1175                                 if ((pos == -1) || (pos >= filePos.end)) { break; }
1176                         #else
1177                                 if (inFASTA.eof()) { break; }
1178                         #endif
1179                         
1180                         //report progress
1181                         if((count) % 100 == 0){ m->mothurOut("Processing sequence: " + toString(count)); m->mothurOutEndLine();         }
1182                 }
1183                 //report progress
1184                 if((count) % 100 != 0){ m->mothurOut("Processing sequence: " + toString(count)); m->mothurOutEndLine();         }
1185                 
1186                         
1187                 goodFile.close();
1188                 inFASTA.close();
1189                 badAccnosFile.close();
1190                 
1191                 return count;
1192         }
1193         catch(exception& e) {
1194                 m->errorOut(e, "ScreenSeqsCommand", "driver");
1195                 exit(1);
1196         }
1197 }
1198 //**********************************************************************************************************************
1199 #ifdef USE_MPI
1200 int ScreenSeqsCommand::driverMPI(int start, int num, MPI_File& inMPI, MPI_File& goodFile, MPI_File& badAccnosFile, vector<unsigned long long>& MPIPos, set<string>& badSeqNames){
1201         try {
1202                 string outputString = "";
1203                 MPI_Status statusGood; 
1204                 MPI_Status statusBadAccnos; 
1205                 MPI_Status status; 
1206                 int pid;
1207                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
1208
1209                 for(int i=0;i<num;i++){
1210                 
1211                         if (m->control_pressed) {  return 0; }
1212                         
1213                         //read next sequence
1214                         int length = MPIPos[start+i+1] - MPIPos[start+i];
1215
1216                         char* buf4 = new char[length];
1217                         memcpy(buf4, outputString.c_str(), length);
1218
1219                         MPI_File_read_at(inMPI, MPIPos[start+i], buf4, length, MPI_CHAR, &status);
1220                         
1221                         string tempBuf = buf4;  delete buf4;
1222                         if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length);  }
1223                         istringstream iss (tempBuf,istringstream::in);
1224                         
1225                         Sequence currSeq(iss);                  
1226                         
1227                         //process seq
1228                         if (currSeq.getName() != "") {
1229                                 bool goodSeq = 1;               //      innocent until proven guilty
1230                                 if(goodSeq == 1 && startPos != -1 && startPos < currSeq.getStartPos())                  {       goodSeq = 0;    }
1231                                 if(goodSeq == 1 && endPos != -1 && endPos > currSeq.getEndPos())                                {       goodSeq = 0;    }
1232                                 if(goodSeq == 1 && maxAmbig != -1 && maxAmbig < currSeq.getAmbigBases())                {       goodSeq = 0;    }
1233                                 if(goodSeq == 1 && maxHomoP != -1 && maxHomoP < currSeq.getLongHomoPolymer())   {       goodSeq = 0;    }
1234                                 if(goodSeq == 1 && minLength != -1 && minLength > currSeq.getNumBases())                {       goodSeq = 0;    }
1235                                 if(goodSeq == 1 && maxLength != -1 && maxLength < currSeq.getNumBases())                {       goodSeq = 0;    }
1236                                 
1237                                 if(goodSeq == 1){
1238                                         outputString =  ">" + currSeq.getName() + "\n" + currSeq.getAligned() + "\n";
1239                                 
1240                                         //print good seq
1241                                         length = outputString.length();
1242                                         char* buf2 = new char[length];
1243                                         memcpy(buf2, outputString.c_str(), length);
1244                                         
1245                                         MPI_File_write_shared(goodFile, buf2, length, MPI_CHAR, &statusGood);
1246                                         delete buf2;
1247                                 }
1248                                 else{
1249
1250                                         badSeqNames.insert(currSeq.getName());
1251                                         
1252                                         //write to bad accnos file
1253                                         outputString = currSeq.getName() + "\n";
1254                                 
1255                                         length = outputString.length();
1256                                         char* buf3 = new char[length];
1257                                         memcpy(buf3, outputString.c_str(), length);
1258                                         
1259                                         MPI_File_write_shared(badAccnosFile, buf3, length, MPI_CHAR, &statusBadAccnos);
1260                                         delete buf3;
1261                                 }
1262                         }
1263                         
1264                         //report progress
1265                         if((i) % 100 == 0){     m->mothurOut("Processing sequence: " + toString(i)); m->mothurOutEndLine();             }
1266                 }
1267                                 
1268                 return 1;
1269         }
1270         catch(exception& e) {
1271                 m->errorOut(e, "ScreenSeqsCommand", "driverMPI");
1272                 exit(1);
1273         }
1274 }
1275 #endif
1276 /**************************************************************************************************/
1277
1278 int ScreenSeqsCommand::createProcesses(string goodFileName, string badAccnos, string filename, set<string>& badSeqNames) {
1279         try {
1280         
1281         vector<int> processIDS;   
1282         int process = 1;
1283                 int num = 0;
1284
1285 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
1286                                 
1287                 //loop through and create all the processes you want
1288                 while (process != processors) {
1289                         int pid = fork();
1290                         
1291                         if (pid > 0) {
1292                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
1293                                 process++;
1294                         }else if (pid == 0){
1295                                 num = driver(lines[process], goodFileName + toString(getpid()) + ".temp", badAccnos + toString(getpid()) + ".temp", filename, badSeqNames);
1296                                 
1297                                 //pass numSeqs to parent
1298                                 ofstream out;
1299                                 string tempFile = filename + toString(getpid()) + ".num.temp";
1300                                 m->openOutputFile(tempFile, out);
1301                                 out << num << endl;
1302                                 out.close();
1303                                 
1304                                 exit(0);
1305                         }else { 
1306                                 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine(); 
1307                                 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
1308                                 exit(0);
1309                         }
1310                 }
1311                 
1312         num = driver(lines[0], goodFileName, badAccnos, filename, badSeqNames);
1313         
1314                 //force parent to wait until all the processes are done
1315                 for (int i=0;i<processIDS.size();i++) { 
1316                         int temp = processIDS[i];
1317                         wait(&temp);
1318                 }
1319                 
1320                 for (int i = 0; i < processIDS.size(); i++) {
1321                         ifstream in;
1322                         string tempFile =  filename + toString(processIDS[i]) + ".num.temp";
1323                         m->openInputFile(tempFile, in);
1324                         if (!in.eof()) { int tempNum = 0; in >> tempNum; num += tempNum; }
1325                         in.close(); m->mothurRemove(tempFile);
1326             
1327             m->appendFiles((goodFileName + toString(processIDS[i]) + ".temp"), goodFileName);
1328             m->mothurRemove((goodFileName + toString(processIDS[i]) + ".temp"));
1329                         
1330             m->appendFiles((badAccnos + toString(processIDS[i]) + ".temp"), badAccnos);
1331             m->mothurRemove((badAccnos + toString(processIDS[i]) + ".temp"));
1332                 }
1333                 
1334         //read badSeqs in because root process doesnt know what other "bad" seqs the children found
1335         ifstream inBad;
1336         int ableToOpen = m->openInputFile(badAccnos, inBad, "no error");
1337         
1338         if (ableToOpen == 0) {
1339             badSeqNames.clear();
1340             string tempName;
1341             while (!inBad.eof()) {
1342                 inBad >> tempName; m->gobble(inBad);
1343                 badSeqNames.insert(tempName);
1344             }
1345             inBad.close();
1346         }
1347 #else
1348         
1349         //////////////////////////////////////////////////////////////////////////////////////////////////////
1350                 //Windows version shared memory, so be careful when passing variables through the sumScreenData struct. 
1351                 //Above fork() will clone, so memory is separate, but that's not the case with windows, 
1352                 //Taking advantage of shared memory to allow both threads to add info to badSeqNames.
1353                 //////////////////////////////////////////////////////////////////////////////////////////////////////
1354                 
1355                 vector<sumScreenData*> pDataArray; 
1356                 DWORD   dwThreadIdArray[processors-1];
1357                 HANDLE  hThreadArray[processors-1]; 
1358                 
1359                 //Create processor worker threads.
1360                 for( int i=0; i<processors-1; i++ ){
1361             
1362             string extension = "";
1363             if (i!=0) {extension += toString(i) + ".temp"; processIDS.push_back(i); }
1364             
1365                         // Allocate memory for thread data.
1366                         sumScreenData* tempSum = new sumScreenData(startPos, endPos, maxAmbig, maxHomoP, minLength, maxLength, filename, m, lines[i].start, lines[i].end,goodFileName+extension, badAccnos+extension, &badSeqNames);
1367                         pDataArray.push_back(tempSum);
1368                         
1369                         //default security attributes, thread function name, argument to thread function, use default creation flags, returns the thread identifier
1370                         hThreadArray[i] = CreateThread(NULL, 0, MySumScreenThreadFunction, pDataArray[i], 0, &dwThreadIdArray[i]);   
1371                 }
1372                 
1373         //do your part
1374         num = driver(lines[processors-1], (goodFileName+toString(processors-1)+".temp"), (badAccnos+toString(processors-1)+".temp"), filename, badSeqNames);
1375         processIDS.push_back(processors-1);
1376         
1377                 //Wait until all threads have terminated.
1378                 WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
1379                 
1380                 //Close all thread handles and free memory allocations.
1381                 for(int i=0; i < pDataArray.size(); i++){
1382                         num += pDataArray[i]->count;
1383                         CloseHandle(hThreadArray[i]);
1384                         delete pDataArray[i];
1385                 }
1386         
1387         for (int i = 0; i < processIDS.size(); i++) {
1388             m->appendFiles((goodFileName + toString(processIDS[i]) + ".temp"), goodFileName);
1389             m->mothurRemove((goodFileName + toString(processIDS[i]) + ".temp"));
1390                         
1391             m->appendFiles((badAccnos + toString(processIDS[i]) + ".temp"), badAccnos);
1392             m->mothurRemove((badAccnos + toString(processIDS[i]) + ".temp"));
1393                 }
1394
1395 #endif  
1396         
1397         return num;
1398         
1399         }
1400         catch(exception& e) {
1401                 m->errorOut(e, "ScreenSeqsCommand", "createProcesses");
1402                 exit(1);
1403         }
1404 }
1405
1406 //***************************************************************************************************************
1407
1408