]> git.donarmstrong.com Git - mothur.git/blob - screenseqscommand.cpp
fixed bug with shhh.flow from file path name in write functions, added "smart" featur...
[mothur.git] / screenseqscommand.cpp
1 /*
2  *  screenseqscommand.cpp
3  *  Mothur
4  *
5  *  Created by Pat Schloss on 6/3/09.
6  *  Copyright 2009 Patrick D. Schloss. All rights reserved.
7  *
8  */
9
10 #include "screenseqscommand.h"
11 #include "sequence.hpp"
12
13 //**********************************************************************************************************************
14 vector<string> ScreenSeqsCommand::setParameters(){      
15         try {
16                 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta);
17                 CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname);
18                 CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pgroup);
19                 CommandParameter pqfile("qfile", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pqfile);
20                 CommandParameter palignreport("alignreport", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(palignreport);
21                 CommandParameter ptax("taxonomy", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(ptax);
22                 CommandParameter pstart("start", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pstart);
23                 CommandParameter pend("end", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pend);
24                 CommandParameter pmaxambig("maxambig", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pmaxambig);
25                 CommandParameter pmaxhomop("maxhomop", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pmaxhomop);
26                 CommandParameter pminlength("minlength", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pminlength);
27                 CommandParameter pmaxlength("maxlength", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pmaxlength);
28                 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
29                 CommandParameter pcriteria("criteria", "Number", "", "90", "", "", "",false,false); parameters.push_back(pcriteria);
30                 CommandParameter poptimize("optimize", "Multiple", "none-start-end-maxambig-maxhomop-minlength-maxlength", "none", "", "", "",true,false); parameters.push_back(poptimize);
31                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
32                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
33                 
34                 vector<string> myArray;
35                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
36                 return myArray;
37         }
38         catch(exception& e) {
39                 m->errorOut(e, "ScreenSeqsCommand", "setParameters");
40                 exit(1);
41         }
42 }
43 //**********************************************************************************************************************
44 string ScreenSeqsCommand::getHelpString(){      
45         try {
46                 string helpString = "";
47                 helpString += "The screen.seqs command reads a fastafile and creates .....\n";
48                 helpString += "The screen.seqs command parameters are fasta, start, end, maxambig, maxhomop, minlength, maxlength, name, group, qfile, alignreport, taxonomy, optimize, criteria and processors.\n";
49                 helpString += "The fasta parameter is required.\n";
50                 helpString += "The alignreport and taxonomy parameters allow you to remove bad seqs from taxonomy and alignreport files.\n";
51                 helpString += "The start parameter .... The default is -1.\n";
52                 helpString += "The end parameter .... The default is -1.\n";
53                 helpString += "The maxambig parameter allows you to set the maximum number of ambigious bases allowed. The default is -1.\n";
54                 helpString += "The maxhomop parameter allows you to set a maximum homopolymer length. \n";
55                 helpString += "The minlength parameter allows you to set and minimum sequence length. \n";
56                 helpString += "The maxlength parameter allows you to set and maximum sequence length. \n";
57                 helpString += "The processors parameter allows you to specify the number of processors to use while running the command. The default is 1.\n";
58                 helpString += "The optimize and criteria parameters allow you set the start, end, maxabig, maxhomop, minlength and maxlength parameters relative to your set of sequences .\n";
59                 helpString += "For example optimize=start-end, criteria=90, would set the start and end values to the position 90% of your sequences started and ended.\n";
60                 helpString += "The name parameter allows you to provide a namesfile, and the group parameter allows you to provide a groupfile.\n";
61                 helpString += "The screen.seqs command should be in the following format: \n";
62                 helpString += "screen.seqs(fasta=yourFastaFile, name=youNameFile, group=yourGroupFIle, start=yourStart, end=yourEnd, maxambig=yourMaxambig,  \n";
63                 helpString += "maxhomop=yourMaxhomop, minlength=youMinlength, maxlength=yourMaxlength)  \n";    
64                 helpString += "Example screen.seqs(fasta=abrecovery.fasta, name=abrecovery.names, group=abrecovery.groups, start=..., end=..., maxambig=..., maxhomop=..., minlength=..., maxlength=...).\n";
65                 helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n";
66                 return helpString;
67         }
68         catch(exception& e) {
69                 m->errorOut(e, "ScreenSeqsCommand", "getHelpString");
70                 exit(1);
71         }
72 }
73 //**********************************************************************************************************************
74 ScreenSeqsCommand::ScreenSeqsCommand(){ 
75         try {
76                 abort = true; calledHelp = true; 
77                 setParameters();
78                 vector<string> tempOutNames;
79                 outputTypes["fasta"] = tempOutNames;
80                 outputTypes["name"] = tempOutNames;
81                 outputTypes["group"] = tempOutNames;
82                 outputTypes["alignreport"] = tempOutNames;
83                 outputTypes["accnos"] = tempOutNames;
84                 outputTypes["qfile"] = tempOutNames;
85                 outputTypes["taxonomy"] = tempOutNames;
86         }
87         catch(exception& e) {
88                 m->errorOut(e, "ScreenSeqsCommand", "ScreenSeqsCommand");
89                 exit(1);
90         }
91 }
92 //***************************************************************************************************************
93
94 ScreenSeqsCommand::ScreenSeqsCommand(string option)  {
95         try {
96                 abort = false; calledHelp = false;   
97                 
98                 //allow user to run help
99                 if(option == "help") { help(); abort = true; calledHelp = true; }
100                 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
101                 
102                 else {
103                         vector<string> myArray = setParameters();
104                         
105                         OptionParser parser(option);
106                         map<string,string> parameters = parser.getParameters();
107                         
108                         ValidParameters validParameter("screen.seqs");
109                         map<string,string>::iterator it;
110                         
111                         //check to make sure all parameters are valid for command
112                         for (it = parameters.begin(); it != parameters.end(); it++) { 
113                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
114                         }
115                         
116                         //initialize outputTypes
117                         vector<string> tempOutNames;
118                         outputTypes["fasta"] = tempOutNames;
119                         outputTypes["name"] = tempOutNames;
120                         outputTypes["group"] = tempOutNames;
121                         outputTypes["alignreport"] = tempOutNames;
122                         outputTypes["accnos"] = tempOutNames;
123                         outputTypes["qfile"] = tempOutNames;
124                         outputTypes["taxonomy"] = tempOutNames;
125                         
126                         //if the user changes the input directory command factory will send this info to us in the output parameter 
127                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
128                         if (inputDir == "not found"){   inputDir = "";          }
129                         else {
130                                 string path;
131                                 it = parameters.find("fasta");
132                                 //user has given a template file
133                                 if(it != parameters.end()){ 
134                                         path = m->hasPath(it->second);
135                                         //if the user has not given a path then, add inputdir. else leave path alone.
136                                         if (path == "") {       parameters["fasta"] = inputDir + it->second;            }
137                                 }
138                                 
139                                 it = parameters.find("group");
140                                 //user has given a template file
141                                 if(it != parameters.end()){ 
142                                         path = m->hasPath(it->second);
143                                         //if the user has not given a path then, add inputdir. else leave path alone.
144                                         if (path == "") {       parameters["group"] = inputDir + it->second;            }
145                                 }
146                                 
147                                 it = parameters.find("name");
148                                 //user has given a template file
149                                 if(it != parameters.end()){ 
150                                         path = m->hasPath(it->second);
151                                         //if the user has not given a path then, add inputdir. else leave path alone.
152                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
153                                 }
154                                 
155                                 it = parameters.find("alignreport");
156                                 //user has given a template file
157                                 if(it != parameters.end()){ 
158                                         path = m->hasPath(it->second);
159                                         //if the user has not given a path then, add inputdir. else leave path alone.
160                                         if (path == "") {       parameters["alignreport"] = inputDir + it->second;              }
161                                 }
162                                 
163                                 it = parameters.find("qfile");
164                                 //user has given a template file
165                                 if(it != parameters.end()){ 
166                                         path = m->hasPath(it->second);
167                                         //if the user has not given a path then, add inputdir. else leave path alone.
168                                         if (path == "") {       parameters["qfile"] = inputDir + it->second;            }
169                                 }
170                                 
171                                 it = parameters.find("taxonomy");
172                                 //user has given a template file
173                                 if(it != parameters.end()){ 
174                                         path = m->hasPath(it->second);
175                                         //if the user has not given a path then, add inputdir. else leave path alone.
176                                         if (path == "") {       parameters["taxonomy"] = inputDir + it->second;         }
177                                 }
178                         }
179
180                         //check for required parameters
181                         fastafile = validParameter.validFile(parameters, "fasta", true);
182                         if (fastafile == "not found") {                         
183                                 fastafile = m->getFastaFile(); 
184                                 if (fastafile != "") { m->mothurOut("Using " + fastafile + " as input file for the fasta parameter."); m->mothurOutEndLine(); }
185                                 else {  m->mothurOut("You have no current fastafile and the fasta parameter is required."); m->mothurOutEndLine(); abort = true; }
186                         }
187                         else if (fastafile == "not open") { abort = true; }
188                         else { m->setFastaFile(fastafile); }
189         
190                         groupfile = validParameter.validFile(parameters, "group", true);
191                         if (groupfile == "not open") { abort = true; }  
192                         else if (groupfile == "not found") { groupfile = ""; }
193                         else { m->setGroupFile(groupfile); }
194                         
195                         qualfile = validParameter.validFile(parameters, "qfile", true);
196                         if (qualfile == "not open") { abort = true; }   
197                         else if (qualfile == "not found") { qualfile = ""; }
198                         else { m->setQualFile(qualfile); }
199                         
200                         namefile = validParameter.validFile(parameters, "name", true);
201                         if (namefile == "not open") { namefile = ""; abort = true; }
202                         else if (namefile == "not found") { namefile = ""; }    
203                         else { m->setNameFile(namefile); }
204                         
205                         alignreport = validParameter.validFile(parameters, "alignreport", true);
206                         if (alignreport == "not open") { abort = true; }
207                         else if (alignreport == "not found") { alignreport = ""; }
208                         
209                         taxonomy = validParameter.validFile(parameters, "taxonomy", true);
210                         if (taxonomy == "not open") { abort = true; }
211                         else if (taxonomy == "not found") { taxonomy = ""; }    
212                         
213                         //if the user changes the output directory command factory will send this info to us in the output parameter 
214                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  
215                                 outputDir = ""; 
216                                 outputDir += m->hasPath(fastafile); //if user entered a file with a path then preserve it       
217                         }
218
219                         //check for optional parameter and set defaults
220                         // ...at some point should added some additional type checking...
221                         string temp;
222                         temp = validParameter.validFile(parameters, "start", false);            if (temp == "not found") { temp = "-1"; }
223                         m->mothurConvert(temp, startPos); 
224                 
225                         temp = validParameter.validFile(parameters, "end", false);                      if (temp == "not found") { temp = "-1"; }
226                         m->mothurConvert(temp, endPos);  
227
228                         temp = validParameter.validFile(parameters, "maxambig", false);         if (temp == "not found") { temp = "-1"; }
229                         m->mothurConvert(temp, maxAmbig);  
230
231                         temp = validParameter.validFile(parameters, "maxhomop", false);         if (temp == "not found") { temp = "-1"; }
232                         m->mothurConvert(temp, maxHomoP);  
233
234                         temp = validParameter.validFile(parameters, "minlength", false);        if (temp == "not found") { temp = "-1"; }
235                         m->mothurConvert(temp, minLength); 
236                         
237                         temp = validParameter.validFile(parameters, "maxlength", false);        if (temp == "not found") { temp = "-1"; }
238                         m->mothurConvert(temp, maxLength); 
239                         
240                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found"){       temp = m->getProcessors();      }
241                         m->setProcessors(temp);
242                         m->mothurConvert(temp, processors);
243                         
244                         temp = validParameter.validFile(parameters, "optimize", false); //optimizing trumps the optimized values original value
245                         if (temp == "not found"){       temp = "none";          }
246                         m->splitAtDash(temp, optimize);         
247                         
248                         //check for invalid optimize options
249                         set<string> validOptimizers;
250                         validOptimizers.insert("none"); validOptimizers.insert("start"); validOptimizers.insert("end"); validOptimizers.insert("maxambig"); validOptimizers.insert("maxhomop"); validOptimizers.insert("minlength"); validOptimizers.insert("maxlength");
251                         for (int i = 0; i < optimize.size(); i++) { 
252                                 if (validOptimizers.count(optimize[i]) == 0) { 
253                                         m->mothurOut(optimize[i] + " is not a valid optimizer. Valid options are start, end, maxambig, maxhomop, minlength and maxlength."); m->mothurOutEndLine();
254                                         optimize.erase(optimize.begin()+i);
255                                         i--;
256                                 }
257                         }
258                         
259                         if (optimize.size() == 1) { if (optimize[0] == "none") { optimize.clear(); } }
260                         
261                         temp = validParameter.validFile(parameters, "criteria", false); if (temp == "not found"){       temp = "90";                            }
262                         m->mothurConvert(temp, criteria); 
263                         
264                         if (namefile == "") {
265                                 vector<string> files; files.push_back(fastafile);
266                                 parser.getNameFile(files);
267                         }
268                 }
269
270         }
271         catch(exception& e) {
272                 m->errorOut(e, "ScreenSeqsCommand", "ScreenSeqsCommand");
273                 exit(1);
274         }
275 }
276 //***************************************************************************************************************
277
278 int ScreenSeqsCommand::execute(){
279         try{
280                 
281                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
282                 
283                 //if the user want to optimize we need to know the 90% mark
284                 vector<unsigned long long> positions;
285                 if (optimize.size() != 0) {  //get summary is paralellized so we need to divideFile, no need to do this step twice so I moved it here
286                         //use the namefile to optimize correctly
287                         if (namefile != "") { nameMap = m->readNames(namefile); }
288                         getSummary(positions); 
289                 } 
290                 else { 
291                         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
292                         positions = m->divideFile(fastafile, processors);
293                         for (int i = 0; i < (positions.size()-1); i++) {
294                                 lines.push_back(new linePair(positions[i], positions[(i+1)]));
295                         }
296                         #else 
297                                 positions.push_back(0); positions.push_back(1000);
298                                 lines.push_back(new linePair(0, 1000));
299                         #endif
300                 }
301                                 
302                 string goodSeqFile = outputDir + m->getRootName(m->getSimpleName(fastafile)) + "good" + m->getExtension(fastafile);
303                 string badAccnosFile =  outputDir + m->getRootName(m->getSimpleName(fastafile)) + "bad.accnos";
304                 
305                 int numFastaSeqs = 0;
306                 set<string> badSeqNames;
307                 int start = time(NULL);
308         
309 #ifdef USE_MPI  
310                         int pid, numSeqsPerProcessor; 
311                         int tag = 2001;
312                         vector<unsigned long long> MPIPos;
313                         
314                         MPI_Status status; 
315                         MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
316                         MPI_Comm_size(MPI_COMM_WORLD, &processors); 
317         
318                         MPI_File inMPI;
319                         MPI_File outMPIGood;
320                         MPI_File outMPIBadAccnos;
321                         
322                         int outMode=MPI_MODE_CREATE|MPI_MODE_WRONLY; 
323                         int inMode=MPI_MODE_RDONLY; 
324                         
325                         char outGoodFilename[1024];
326                         strcpy(outGoodFilename, goodSeqFile.c_str());
327
328                         char outBadAccnosFilename[1024];
329                         strcpy(outBadAccnosFilename, badAccnosFile.c_str());
330
331                         char inFileName[1024];
332                         strcpy(inFileName, fastafile.c_str());
333                         
334                         MPI_File_open(MPI_COMM_WORLD, inFileName, inMode, MPI_INFO_NULL, &inMPI);  //comm, filename, mode, info, filepointer
335                         MPI_File_open(MPI_COMM_WORLD, outGoodFilename, outMode, MPI_INFO_NULL, &outMPIGood);
336                         MPI_File_open(MPI_COMM_WORLD, outBadAccnosFilename, outMode, MPI_INFO_NULL, &outMPIBadAccnos);
337                         
338                         if (m->control_pressed) { MPI_File_close(&inMPI);  MPI_File_close(&outMPIGood); MPI_File_close(&outMPIBadAccnos); return 0; }
339                         
340                         if (pid == 0) { //you are the root process 
341                                 
342                                 MPIPos = m->setFilePosFasta(fastafile, numFastaSeqs); //fills MPIPos, returns numSeqs
343                                 
344                                 //send file positions to all processes
345                                 for(int i = 1; i < processors; i++) { 
346                                         MPI_Send(&numFastaSeqs, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
347                                         MPI_Send(&MPIPos[0], (numFastaSeqs+1), MPI_LONG, i, tag, MPI_COMM_WORLD);
348                                 }
349                                 
350                                 //figure out how many sequences you have to align
351                                 numSeqsPerProcessor = numFastaSeqs / processors;
352                                 int startIndex =  pid * numSeqsPerProcessor;
353                                 if(pid == (processors - 1)){    numSeqsPerProcessor = numFastaSeqs - pid * numSeqsPerProcessor;         }
354                 //      cout << pid << '\t' << numSeqsPerProcessor << '\t' <<   startIndex << endl;
355                                 //align your part
356                                 driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPIGood, outMPIBadAccnos, MPIPos, badSeqNames);
357                 //cout << pid << " done" << endl;
358                                 if (m->control_pressed) { MPI_File_close(&inMPI);  MPI_File_close(&outMPIGood);  MPI_File_close(&outMPIBadAccnos);  return 0; }
359
360                                 for (int i = 1; i < processors; i++) {
361                                 
362                                         //get bad lists
363                                         int badSize;
364                                         MPI_Recv(&badSize, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
365                                         /*for (int j = 0; j < badSize; j++) {
366                                                 int length;
367                                                 MPI_Recv(&length, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);  //recv the length of the name
368                                                 char* buf2 = new char[length];                                                                          //make space to recieve it
369                                                 MPI_Recv(buf2, length, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status);  //get name
370                                                 
371                                                 string tempBuf = buf2;
372                                                 if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length); }
373                                                 delete buf2;
374                                                 
375                                                 badSeqNames.insert(tempBuf);
376                                         }*/
377                                 }
378                         }else{ //you are a child process
379                                 MPI_Recv(&numFastaSeqs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
380                                 MPIPos.resize(numFastaSeqs+1);
381                                 MPI_Recv(&MPIPos[0], (numFastaSeqs+1), MPI_LONG, 0, tag, MPI_COMM_WORLD, &status);
382
383                                 //figure out how many sequences you have to align
384                                 numSeqsPerProcessor = numFastaSeqs / processors;
385                                 int startIndex =  pid * numSeqsPerProcessor;
386                                 if(pid == (processors - 1)){    numSeqsPerProcessor = numFastaSeqs - pid * numSeqsPerProcessor;         }
387                 //cout << pid << '\t' << numSeqsPerProcessor << '\t' <<         startIndex << endl;             
388                                 //align your part
389                                 driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPIGood, outMPIBadAccnos, MPIPos, badSeqNames);
390 //cout << pid << " done" << endl;
391                                 if (m->control_pressed) { MPI_File_close(&inMPI);  MPI_File_close(&outMPIGood);  MPI_File_close(&outMPIBadAccnos); return 0; }
392                                 
393                                 //send bad list 
394                                 int badSize = badSeqNames.size();
395                                 MPI_Send(&badSize, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
396                                 
397                                 /*
398                                 set<string>::iterator it;
399                                 for (it = badSeqNames.begin(); it != badSeqNames.end(); it++) {
400                                         string name = *it;
401                                         int length = name.length();
402                                         char* buf2 = new char[length];
403                                         memcpy(buf2, name.c_str(), length);
404                                         
405                                         MPI_Send(&length, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
406                                         MPI_Send(buf2, length, MPI_CHAR, 0, tag, MPI_COMM_WORLD);
407                                 }*/
408                         }
409                         
410                         //close files 
411                         MPI_File_close(&inMPI);
412                         MPI_File_close(&outMPIGood);
413                         MPI_File_close(&outMPIBadAccnos);
414                         MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case
415                                         
416 #else
417                                                 
418         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
419                         if(processors == 1){
420                                 numFastaSeqs = driver(lines[0], goodSeqFile, badAccnosFile, fastafile, badSeqNames);
421                                 
422                                 if (m->control_pressed) { m->mothurRemove(goodSeqFile); return 0; }
423                                 
424                         }else{
425                                 processIDS.resize(0);
426                                 
427                                 numFastaSeqs = createProcesses(goodSeqFile, badAccnosFile, fastafile, badSeqNames); 
428                                 
429                                 rename((goodSeqFile + toString(processIDS[0]) + ".temp").c_str(), goodSeqFile.c_str());
430                                 rename((badAccnosFile + toString(processIDS[0]) + ".temp").c_str(), badAccnosFile.c_str());
431                                 
432                                 //append alignment and report files
433                                 for(int i=1;i<processors;i++){
434                                         m->appendFiles((goodSeqFile + toString(processIDS[i]) + ".temp"), goodSeqFile);
435                                         m->mothurRemove((goodSeqFile + toString(processIDS[i]) + ".temp"));
436                         
437                                         m->appendFiles((badAccnosFile + toString(processIDS[i]) + ".temp"), badAccnosFile);
438                                         m->mothurRemove((badAccnosFile + toString(processIDS[i]) + ".temp"));
439                                 }
440                                 
441                                 if (m->control_pressed) { m->mothurRemove(goodSeqFile); return 0; }
442                                 
443                                 //read badSeqs in because root process doesnt know what other "bad" seqs the children found
444                                 ifstream inBad;
445                                 int ableToOpen = m->openInputFile(badAccnosFile, inBad, "no error");
446                                 
447                                 if (ableToOpen == 0) {
448                                         badSeqNames.clear();
449                                         string tempName;
450                                         while (!inBad.eof()) {
451                                                 inBad >> tempName; m->gobble(inBad);
452                                                 badSeqNames.insert(tempName);
453                                         }
454                                         inBad.close();
455                                 }
456                         }
457         #else
458                         numFastaSeqs = driver(lines[0], goodSeqFile, badAccnosFile, fastafile, badSeqNames);
459                         
460                         if (m->control_pressed) { m->mothurRemove(goodSeqFile); return 0; }
461                         
462         #endif
463
464 #endif          
465
466                 #ifdef USE_MPI
467                         MPI_Comm_rank(MPI_COMM_WORLD, &pid); 
468                                         
469                         if (pid == 0) { //only one process should fix files
470                         
471                                 //read accnos file with all names in it, process 0 just has its names
472                                 MPI_File inMPIAccnos;
473                                 MPI_Offset size;
474                         
475                                 char inFileName[1024];
476                                 strcpy(inFileName, badAccnosFile.c_str());
477                         
478                                 MPI_File_open(MPI_COMM_SELF, inFileName, inMode, MPI_INFO_NULL, &inMPIAccnos);  //comm, filename, mode, info, filepointer
479                                 MPI_File_get_size(inMPIAccnos, &size);
480                         
481                                 char* buffer = new char[size];
482                                 MPI_File_read(inMPIAccnos, buffer, size, MPI_CHAR, &status);
483                         
484                                 string tempBuf = buffer;
485                                 if (tempBuf.length() > size) { tempBuf = tempBuf.substr(0, size);  }
486                                 istringstream iss (tempBuf,istringstream::in);
487
488                                 delete buffer;
489                                 MPI_File_close(&inMPIAccnos);
490                                 
491                                 badSeqNames.clear();
492                                 string tempName;
493                                 while (!iss.eof()) {
494                                         iss >> tempName; m->gobble(iss);
495                                         badSeqNames.insert(tempName);
496                                 }
497                 #endif
498                                                                                                                                                                         
499                 if(namefile != "" && groupfile != "")   {       
500                         screenNameGroupFile(badSeqNames);       
501                         if (m->control_pressed) {  m->mothurRemove(goodSeqFile); return 0; }
502                 }else if(namefile != "")        {       
503                         screenNameGroupFile(badSeqNames);
504                         if (m->control_pressed) {  m->mothurRemove(goodSeqFile);  return 0; }   
505                 }else if(groupfile != "")                               {       screenGroupFile(badSeqNames);           }       // this screens just the group
506                 
507                 if (m->control_pressed) { m->mothurRemove(goodSeqFile);  return 0; }
508
509                 if(alignreport != "")                                   {       screenAlignReport(badSeqNames);         }
510                 if(qualfile != "")                                              {       screenQual(badSeqNames);                        }
511                 if(taxonomy != "")                                              {       screenTaxonomy(badSeqNames);            }
512                 
513                 if (m->control_pressed) { m->mothurRemove(goodSeqFile);  return 0; }
514                 
515                 #ifdef USE_MPI
516                         }
517                 #endif
518
519                 m->mothurOutEndLine();
520                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
521                 m->mothurOut(goodSeqFile); m->mothurOutEndLine();       outputTypes["fasta"].push_back(goodSeqFile);
522                 m->mothurOut(badAccnosFile); m->mothurOutEndLine();      outputTypes["accnos"].push_back(badAccnosFile);
523                 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
524                 m->mothurOutEndLine();
525                 m->mothurOutEndLine();
526                 
527                 //set fasta file as new current fastafile
528                 string current = "";
529                 itTypes = outputTypes.find("fasta");
530                 if (itTypes != outputTypes.end()) {
531                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFastaFile(current); }
532                 }
533                 
534                 itTypes = outputTypes.find("name");
535                 if (itTypes != outputTypes.end()) {
536                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setNameFile(current); }
537                 }
538                 
539                 itTypes = outputTypes.find("group");
540                 if (itTypes != outputTypes.end()) {
541                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setGroupFile(current); }
542                 }
543                 
544                 itTypes = outputTypes.find("qfile");
545                 if (itTypes != outputTypes.end()) {
546                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setQualFile(current); }
547                 }
548                 
549                 itTypes = outputTypes.find("taxonomy");
550                 if (itTypes != outputTypes.end()) {
551                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setTaxonomyFile(current); }
552                 }
553
554                 m->mothurOut("It took " + toString(time(NULL) - start) + " secs to screen " + toString(numFastaSeqs) + " sequences.");
555                 m->mothurOutEndLine();
556
557                 return 0;
558         }
559         catch(exception& e) {
560                 m->errorOut(e, "ScreenSeqsCommand", "execute");
561                 exit(1);
562         }
563 }
564
565 //***************************************************************************************************************
566
567 int ScreenSeqsCommand::screenNameGroupFile(set<string> badSeqNames){
568         try {
569                 ifstream inputNames;
570                 m->openInputFile(namefile, inputNames);
571                 set<string> badSeqGroups;
572                 string seqName, seqList, group;
573                 set<string>::iterator it;
574
575                 string goodNameFile = outputDir + m->getRootName(m->getSimpleName(namefile)) + "good" + m->getExtension(namefile);
576                 outputNames.push_back(goodNameFile);  outputTypes["name"].push_back(goodNameFile);
577                 
578                 ofstream goodNameOut;   m->openOutputFile(goodNameFile, goodNameOut);
579                 
580                 while(!inputNames.eof()){
581                         if (m->control_pressed) { goodNameOut.close();  inputNames.close(); m->mothurRemove(goodNameFile);  return 0; }
582
583                         inputNames >> seqName >> seqList;
584                         it = badSeqNames.find(seqName);
585                                 
586                         if(it != badSeqNames.end()){
587                                 badSeqNames.erase(it);
588                                 
589                                 if(namefile != ""){
590                                         int start = 0;
591                                         for(int i=0;i<seqList.length();i++){
592                                                 if(seqList[i] == ','){
593                                                         badSeqGroups.insert(seqList.substr(start,i-start));
594                                                         start = i+1;
595                                                 }                                       
596                                         }
597                                         badSeqGroups.insert(seqList.substr(start,seqList.length()-start));
598                                 }
599                         }
600                         else{
601                                 goodNameOut << seqName << '\t' << seqList << endl;
602                         }
603                         m->gobble(inputNames);
604                 }
605                 inputNames.close();
606                 goodNameOut.close();
607         
608                 //we were unable to remove some of the bad sequences
609                 if (badSeqNames.size() != 0) {
610                         for (it = badSeqNames.begin(); it != badSeqNames.end(); it++) {  
611                                 m->mothurOut("Your namefile does not include the sequence " + *it + " please correct."); 
612                                 m->mothurOutEndLine();
613                         }
614                 }
615
616                 if(groupfile != ""){
617                         
618                         ifstream inputGroups;
619                         m->openInputFile(groupfile, inputGroups);
620
621                         string goodGroupFile = outputDir + m->getRootName(m->getSimpleName(groupfile)) + "good" + m->getExtension(groupfile);
622                         outputNames.push_back(goodGroupFile);   outputTypes["group"].push_back(goodGroupFile);
623                         
624                         ofstream goodGroupOut;  m->openOutputFile(goodGroupFile, goodGroupOut);
625                         
626                         while(!inputGroups.eof()){
627                                 if (m->control_pressed) { goodGroupOut.close(); inputGroups.close(); m->mothurRemove(goodNameFile);  m->mothurRemove(goodGroupFile); return 0; }
628
629                                 inputGroups >> seqName >> group;
630                                 
631                                 it = badSeqGroups.find(seqName);
632                                 
633                                 if(it != badSeqGroups.end()){
634                                         badSeqGroups.erase(it);
635                                 }
636                                 else{
637                                         goodGroupOut << seqName << '\t' << group << endl;
638                                 }
639                                 m->gobble(inputGroups);
640                         }
641                         inputGroups.close();
642                         goodGroupOut.close();
643                         
644                         //we were unable to remove some of the bad sequences
645                         if (badSeqGroups.size() != 0) {
646                                 for (it = badSeqGroups.begin(); it != badSeqGroups.end(); it++) {  
647                                         m->mothurOut("Your groupfile does not include the sequence " + *it + " please correct."); 
648                                         m->mothurOutEndLine();
649                                 }
650                         }
651                 }
652                 
653                 
654                 return 0;
655         
656         }
657         catch(exception& e) {
658                 m->errorOut(e, "ScreenSeqsCommand", "screenNameGroupFile");
659                 exit(1);
660         }
661 }
662 //***************************************************************************************************************
663 int ScreenSeqsCommand::getSummary(vector<unsigned long long>& positions){
664         try {
665                 
666                 vector<int> startPosition;
667                 vector<int> endPosition;
668                 vector<int> seqLength;
669                 vector<int> ambigBases;
670                 vector<int> longHomoPolymer;
671                 
672 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
673                 vector<unsigned long long> positions = m->divideFile(fastafile, processors);
674                 
675                 for (int i = 0; i < (positions.size()-1); i++) {
676                         lines.push_back(new linePair(positions[i], positions[(i+1)]));
677                 }       
678 #else
679                 lines.push_back(new linePair(0, 1000));
680 #endif
681                 
682 #ifdef USE_MPI
683                 int pid;
684                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); 
685                 
686                 if (pid == 0) { 
687                         driverCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, fastafile, lines[0]);
688 #else
689                 int numSeqs = 0;
690                 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
691                         if(processors == 1){
692                                 numSeqs = driverCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, fastafile, lines[0]);
693                         }else{
694                                 numSeqs = createProcessesCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, fastafile); 
695                         }
696                                 
697                         if (m->control_pressed) {  return 0; }
698                 #else
699                         numSeqs = driverCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, fastafile, lines[0]);
700                         if (m->control_pressed) {  return 0; }
701                 #endif
702 #endif
703                 sort(startPosition.begin(), startPosition.end());
704                 sort(endPosition.begin(), endPosition.end());
705                 sort(seqLength.begin(), seqLength.end());
706                 sort(ambigBases.begin(), ambigBases.end());
707                 sort(longHomoPolymer.begin(), longHomoPolymer.end());
708                 
709                 //numSeqs is the number of unique seqs, startPosition.size() is the total number of seqs, we want to optimize using all seqs
710                 int criteriaPercentile  = int(startPosition.size() * (criteria / (float) 100));
711                 
712                 for (int i = 0; i < optimize.size(); i++) {
713                         if (optimize[i] == "start") { startPos = startPosition[criteriaPercentile]; m->mothurOut("Optimizing start to " + toString(startPos) + "."); m->mothurOutEndLine(); }
714                         else if (optimize[i] == "end") { int endcriteriaPercentile = int(endPosition.size() * ((100 - criteria) / (float) 100));  endPos = endPosition[endcriteriaPercentile]; m->mothurOut("Optimizing end to " + toString(endPos) + "."); m->mothurOutEndLine();}
715                         else if (optimize[i] == "maxambig") { maxAmbig = ambigBases[criteriaPercentile]; m->mothurOut("Optimizing maxambig to " + toString(maxAmbig) + "."); m->mothurOutEndLine(); }
716                         else if (optimize[i] == "maxhomop") { maxHomoP = longHomoPolymer[criteriaPercentile]; m->mothurOut("Optimizing maxhomop to " + toString(maxHomoP) + "."); m->mothurOutEndLine(); }
717                         else if (optimize[i] == "minlength") { int mincriteriaPercentile = int(seqLength.size() * ((100 - criteria) / (float) 100)); minLength = seqLength[mincriteriaPercentile]; m->mothurOut("Optimizing minlength to " + toString(minLength) + "."); m->mothurOutEndLine(); }
718                         else if (optimize[i] == "maxlength") { maxLength = seqLength[criteriaPercentile]; m->mothurOut("Optimizing maxlength to " + toString(maxLength) + "."); m->mothurOutEndLine(); }
719                 }
720
721 #ifdef USE_MPI
722                 }
723                 
724                 MPI_Status status; 
725                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); 
726                 MPI_Comm_size(MPI_COMM_WORLD, &processors); 
727                         
728                 if (pid == 0) { 
729                         //send file positions to all processes
730                         for(int i = 1; i < processors; i++) { 
731                                 MPI_Send(&startPos, 1, MPI_INT, i, 2001, MPI_COMM_WORLD);
732                                 MPI_Send(&endPos, 1, MPI_INT, i, 2001, MPI_COMM_WORLD);
733                                 MPI_Send(&maxAmbig, 1, MPI_INT, i, 2001, MPI_COMM_WORLD);
734                                 MPI_Send(&maxHomoP, 1, MPI_INT, i, 2001, MPI_COMM_WORLD);
735                                 MPI_Send(&minLength, 1, MPI_INT, i, 2001, MPI_COMM_WORLD);
736                                 MPI_Send(&maxLength, 1, MPI_INT, i, 2001, MPI_COMM_WORLD);
737                         }
738                 }else {
739                         MPI_Recv(&startPos, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status);
740                         MPI_Recv(&endPos, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status);
741                         MPI_Recv(&maxAmbig, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status);
742                         MPI_Recv(&maxHomoP, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status);
743                         MPI_Recv(&minLength, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status);
744                         MPI_Recv(&maxLength, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status);
745                 }
746                 MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case
747 #endif
748                 return 0;
749         }
750         catch(exception& e) {
751                 m->errorOut(e, "ScreenSeqsCommand", "getSummary");
752                 exit(1);
753         }
754 }
755 /**************************************************************************************/
756 int ScreenSeqsCommand::driverCreateSummary(vector<int>& startPosition, vector<int>& endPosition, vector<int>& seqLength, vector<int>& ambigBases, vector<int>& longHomoPolymer, string filename, linePair* filePos) {   
757         try {
758                 
759                 ifstream in;
760                 m->openInputFile(filename, in);
761                                 
762                 in.seekg(filePos->start);
763
764                 bool done = false;
765                 int count = 0;
766         
767                 while (!done) {
768                                 
769                         if (m->control_pressed) { in.close(); return 1; }
770                                         
771                         Sequence current(in); m->gobble(in);
772         
773                         if (current.getName() != "") {
774                                 int num = 1;
775                                 if (namefile != "") {
776                                         //make sure this sequence is in the namefile, else error 
777                                         map<string, int>::iterator it = nameMap.find(current.getName());
778                                         
779                                         if (it == nameMap.end()) { m->mothurOut("[ERROR]: " + current.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); m->control_pressed = true; }
780                                         else { num = it->second; }
781                                 }
782                                 
783                                 //for each sequence this sequence represents
784                                 for (int i = 0; i < num; i++) {
785                                         startPosition.push_back(current.getStartPos());
786                                         endPosition.push_back(current.getEndPos());
787                                         seqLength.push_back(current.getNumBases());
788                                         ambigBases.push_back(current.getAmbigBases());
789                                         longHomoPolymer.push_back(current.getLongHomoPolymer());
790                                 }
791                                 
792                                 count++;
793                         }
794                         //if((count) % 100 == 0){       m->mothurOut("Optimizing sequence: " + toString(count)); m->mothurOutEndLine();         }
795                         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
796                                 unsigned long long pos = in.tellg();
797                                 if ((pos == -1) || (pos >= filePos->end)) { break; }
798                         #else
799                                 if (in.eof()) { break; }
800                         #endif
801                         
802                 }
803                 
804                 in.close();
805                 
806                 return count;
807         }
808         catch(exception& e) {
809                 m->errorOut(e, "ScreenSeqsCommand", "driverCreateSummary");
810                 exit(1);
811         }
812 }
813 /**************************************************************************************************/
814 int ScreenSeqsCommand::createProcessesCreateSummary(vector<int>& startPosition, vector<int>& endPosition, vector<int>& seqLength, vector<int>& ambigBases, vector<int>& longHomoPolymer, string filename) {
815         try {
816 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
817                 int process = 1;
818                 int num = 0;
819                 processIDS.clear();
820                 
821                 //loop through and create all the processes you want
822                 while (process != processors) {
823                         int pid = fork();
824                         
825                         if (pid > 0) {
826                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
827                                 process++;
828                         }else if (pid == 0){
829                                 num = driverCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, fastafile, lines[process]);
830                                 
831                                 //pass numSeqs to parent
832                                 ofstream out;
833                                 string tempFile = fastafile + toString(getpid()) + ".num.temp";
834                                 m->openOutputFile(tempFile, out);
835                                 
836                                 out << num << endl;
837                                 out << startPosition.size() << endl;
838                                 for (int k = 0; k < startPosition.size(); k++)          {               out << startPosition[k] << '\t'; }  out << endl;
839                                 for (int k = 0; k < endPosition.size(); k++)            {               out << endPosition[k] << '\t'; }  out << endl;
840                                 for (int k = 0; k < seqLength.size(); k++)                      {               out << seqLength[k] << '\t'; }  out << endl;
841                                 for (int k = 0; k < ambigBases.size(); k++)                     {               out << ambigBases[k] << '\t'; }  out << endl;
842                                 for (int k = 0; k < longHomoPolymer.size(); k++)        {               out << longHomoPolymer[k] << '\t'; }  out << endl;
843                                 
844                                 out.close();
845                                 
846                                 exit(0);
847                         }else { 
848                                 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine(); 
849                                 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
850                                 exit(0);
851                         }
852                 }
853                 
854                 num = driverCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, fastafile, lines[0]);
855                 
856                 //force parent to wait until all the processes are done
857                 for (int i=0;i<processIDS.size();i++) { 
858                         int temp = processIDS[i];
859                         wait(&temp);
860                 }
861                 
862                 //parent reads in and combine Filter info
863                 for (int i = 0; i < processIDS.size(); i++) {
864                         string tempFilename = fastafile + toString(processIDS[i]) + ".num.temp";
865                         ifstream in;
866                         m->openInputFile(tempFilename, in);
867                         
868                         int temp, tempNum;
869                         in >> tempNum; m->gobble(in); num += tempNum;
870                         in >> tempNum; m->gobble(in);
871                         for (int k = 0; k < tempNum; k++)                       {               in >> temp; startPosition.push_back(temp);              }               m->gobble(in);
872                         for (int k = 0; k < tempNum; k++)                       {               in >> temp; endPosition.push_back(temp);                }               m->gobble(in);
873                         for (int k = 0; k < tempNum; k++)                       {               in >> temp; seqLength.push_back(temp);                  }               m->gobble(in);
874                         for (int k = 0; k < tempNum; k++)                       {               in >> temp; ambigBases.push_back(temp);                 }               m->gobble(in);
875                         for (int k = 0; k < tempNum; k++)                       {               in >> temp; longHomoPolymer.push_back(temp);    }               m->gobble(in);
876                                 
877                         in.close();
878                         m->mothurRemove(tempFilename);
879                 }
880                 
881                 return num;
882 #endif          
883         }
884         catch(exception& e) {
885                 m->errorOut(e, "ScreenSeqsCommand", "createProcessesCreateSummary");
886                 exit(1);
887         }
888 }
889
890 //***************************************************************************************************************
891
892 int ScreenSeqsCommand::screenGroupFile(set<string> badSeqNames){
893         try {
894                 ifstream inputGroups;
895                 m->openInputFile(groupfile, inputGroups);
896                 string seqName, group;
897                 set<string>::iterator it;
898                 
899                 string goodGroupFile = outputDir + m->getRootName(m->getSimpleName(groupfile)) + "good" + m->getExtension(groupfile);
900                 outputNames.push_back(goodGroupFile);  outputTypes["group"].push_back(goodGroupFile);
901                 ofstream goodGroupOut;  m->openOutputFile(goodGroupFile, goodGroupOut);
902                 
903                 while(!inputGroups.eof()){
904                         if (m->control_pressed) { goodGroupOut.close(); inputGroups.close(); m->mothurRemove(goodGroupFile); return 0; }
905
906                         inputGroups >> seqName >> group;
907                         it = badSeqNames.find(seqName);
908                         
909                         if(it != badSeqNames.end()){
910                                 badSeqNames.erase(it);
911                         }
912                         else{
913                                 goodGroupOut << seqName << '\t' << group << endl;
914                         }
915                         m->gobble(inputGroups);
916                 }
917                 
918                 if (m->control_pressed) { goodGroupOut.close();  inputGroups.close(); m->mothurRemove(goodGroupFile);  return 0; }
919
920                 //we were unable to remove some of the bad sequences
921                 if (badSeqNames.size() != 0) {
922                         for (it = badSeqNames.begin(); it != badSeqNames.end(); it++) {  
923                                 m->mothurOut("Your groupfile does not include the sequence " + *it + " please correct."); 
924                                 m->mothurOutEndLine();
925                         }
926                 }
927                 
928                 inputGroups.close();
929                 goodGroupOut.close();
930                 
931                 if (m->control_pressed) { m->mothurRemove(goodGroupFile);   }
932                 
933                 return 0;
934         
935         }
936         catch(exception& e) {
937                 m->errorOut(e, "ScreenSeqsCommand", "screenGroupFile");
938                 exit(1);
939         }
940 }
941
942 //***************************************************************************************************************
943
944 int ScreenSeqsCommand::screenAlignReport(set<string> badSeqNames){
945         try {
946                 ifstream inputAlignReport;
947                 m->openInputFile(alignreport, inputAlignReport);
948                 string seqName, group;
949                 set<string>::iterator it;
950                 
951                 string goodAlignReportFile = outputDir + m->getRootName(m->getSimpleName(alignreport)) + "good" + m->getExtension(alignreport);
952                 outputNames.push_back(goodAlignReportFile);  outputTypes["alignreport"].push_back(goodAlignReportFile);
953                 ofstream goodAlignReportOut;    m->openOutputFile(goodAlignReportFile, goodAlignReportOut);
954
955                 while (!inputAlignReport.eof()) {               //      need to copy header
956                         char c = inputAlignReport.get();
957                         goodAlignReportOut << c;
958                         if (c == 10 || c == 13){        break;  }       
959                 }
960
961                 while(!inputAlignReport.eof()){
962                         if (m->control_pressed) { goodAlignReportOut.close(); inputAlignReport.close(); m->mothurRemove(goodAlignReportFile); return 0; }
963
964                         inputAlignReport >> seqName;
965                         it = badSeqNames.find(seqName);
966                         string line;            
967                         while (!inputAlignReport.eof()) {               //      need to copy header
968                                 char c = inputAlignReport.get();
969                                 line += c;
970                                 if (c == 10 || c == 13){        break;  }       
971                         }
972                         
973                         if(it != badSeqNames.end()){
974                                 badSeqNames.erase(it);
975                         }
976                         else{
977                                 goodAlignReportOut << seqName << '\t' << line;
978                         }
979                         m->gobble(inputAlignReport);
980                 }
981                 
982                 if (m->control_pressed) { goodAlignReportOut.close();  inputAlignReport.close(); m->mothurRemove(goodAlignReportFile);  return 0; }
983
984                 //we were unable to remove some of the bad sequences
985                 if (badSeqNames.size() != 0) {
986                         for (it = badSeqNames.begin(); it != badSeqNames.end(); it++) {  
987                                 m->mothurOut("Your alignreport file does not include the sequence " + *it + " please correct."); 
988                                 m->mothurOutEndLine();
989                         }
990                 }
991
992                 inputAlignReport.close();
993                 goodAlignReportOut.close();
994                                 
995                 if (m->control_pressed) {  m->mothurRemove(goodAlignReportFile);  return 0; }
996                 
997                 return 0;
998         
999         }
1000         catch(exception& e) {
1001                 m->errorOut(e, "ScreenSeqsCommand", "screenAlignReport");
1002                 exit(1);
1003         }
1004         
1005 }
1006 //***************************************************************************************************************
1007
1008 int ScreenSeqsCommand::screenTaxonomy(set<string> badSeqNames){
1009         try {
1010                 ifstream input;
1011                 m->openInputFile(taxonomy, input);
1012                 string seqName, tax;
1013                 set<string>::iterator it;
1014                 
1015                 string goodTaxFile = outputDir + m->getRootName(m->getSimpleName(taxonomy)) + "good" + m->getExtension(taxonomy);
1016                 outputNames.push_back(goodTaxFile);  outputTypes["taxonomy"].push_back(goodTaxFile);
1017                 ofstream goodTaxOut;    m->openOutputFile(goodTaxFile, goodTaxOut);
1018                                 
1019                 while(!input.eof()){
1020                         if (m->control_pressed) { goodTaxOut.close(); input.close(); m->mothurRemove(goodTaxFile); return 0; }
1021                         
1022                         input >> seqName >> tax;
1023                         it = badSeqNames.find(seqName);
1024                         
1025                         if(it != badSeqNames.end()){ badSeqNames.erase(it); }
1026                         else{
1027                                 goodTaxOut << seqName << '\t' << tax << endl;
1028                         }
1029                         m->gobble(input);
1030                 }
1031                 
1032                 if (m->control_pressed) { goodTaxOut.close(); input.close(); m->mothurRemove(goodTaxFile); return 0; }
1033                 
1034                 //we were unable to remove some of the bad sequences
1035                 if (badSeqNames.size() != 0) {
1036                         for (it = badSeqNames.begin(); it != badSeqNames.end(); it++) {  
1037                                 m->mothurOut("Your taxonomy file does not include the sequence " + *it + " please correct."); 
1038                                 m->mothurOutEndLine();
1039                         }
1040                 }
1041                 
1042                 input.close();
1043                 goodTaxOut.close();
1044                 
1045                 if (m->control_pressed) {  m->mothurRemove(goodTaxFile);  return 0; }
1046                 
1047                 return 0;
1048                 
1049         }
1050         catch(exception& e) {
1051                 m->errorOut(e, "ScreenSeqsCommand", "screenTaxonomy");
1052                 exit(1);
1053         }
1054         
1055 }
1056 //***************************************************************************************************************
1057
1058 int ScreenSeqsCommand::screenQual(set<string> badSeqNames){
1059         try {
1060                 ifstream in;
1061                 m->openInputFile(qualfile, in);
1062                 set<string>::iterator it;
1063                 
1064                 string goodQualFile = outputDir + m->getRootName(m->getSimpleName(qualfile)) + "good" + m->getExtension(qualfile);
1065                 outputNames.push_back(goodQualFile);  outputTypes["qfile"].push_back(goodQualFile);
1066                 ofstream goodQual;      m->openOutputFile(goodQualFile, goodQual);
1067                 
1068                 while(!in.eof()){       
1069                         
1070                         if (m->control_pressed) { goodQual.close(); in.close(); m->mothurRemove(goodQualFile); return 0; }
1071
1072                         string saveName = "";
1073                         string name = "";
1074                         string scores = "";
1075                         
1076                         in >> name; 
1077                         
1078                         if (name.length() != 0) { 
1079                                 saveName = name.substr(1);
1080                                 while (!in.eof())       {       
1081                                         char c = in.get(); 
1082                                         if (c == 10 || c == 13){        break;  }
1083                                         else { name += c; }     
1084                                 } 
1085                                 m->gobble(in);
1086                         }
1087                         
1088                         while(in){
1089                                 char letter= in.get();
1090                                 if(letter == '>'){      in.putback(letter);     break;  }
1091                                 else{ scores += letter; }
1092                         }
1093                         
1094                         m->gobble(in);
1095                         
1096                         it = badSeqNames.find(saveName);
1097                         
1098                         if(it != badSeqNames.end()){
1099                                 badSeqNames.erase(it);
1100                         }else{                          
1101                                 goodQual << name << endl << scores;
1102                         }
1103                         
1104                         m->gobble(in);
1105                 }
1106                 
1107                 in.close();
1108                 goodQual.close();
1109                 
1110                 //we were unable to remove some of the bad sequences
1111                 if (badSeqNames.size() != 0) {
1112                         for (it = badSeqNames.begin(); it != badSeqNames.end(); it++) {  
1113                                 m->mothurOut("Your qual file does not include the sequence " + *it + " please correct."); 
1114                                 m->mothurOutEndLine();
1115                         }
1116                 }
1117                 
1118                 if (m->control_pressed) {  m->mothurRemove(goodQualFile);  return 0; }
1119                 
1120                 return 0;
1121                 
1122         }
1123         catch(exception& e) {
1124                 m->errorOut(e, "ScreenSeqsCommand", "screenQual");
1125                 exit(1);
1126         }
1127         
1128 }
1129 //**********************************************************************************************************************
1130
1131 int ScreenSeqsCommand::driver(linePair* filePos, string goodFName, string badAccnosFName, string filename, set<string>& badSeqNames){
1132         try {
1133                 ofstream goodFile;
1134                 m->openOutputFile(goodFName, goodFile);
1135                 
1136                 ofstream badAccnosFile;
1137                 m->openOutputFile(badAccnosFName, badAccnosFile);
1138                 
1139                 ifstream inFASTA;
1140                 m->openInputFile(filename, inFASTA);
1141
1142                 inFASTA.seekg(filePos->start);
1143
1144                 bool done = false;
1145                 int count = 0;
1146         
1147                 while (!done) {
1148                 
1149                         if (m->control_pressed) {  return 0; }
1150                         
1151                         Sequence currSeq(inFASTA); m->gobble(inFASTA);
1152                         if (currSeq.getName() != "") {
1153                                 bool goodSeq = 1;               //      innocent until proven guilty
1154                                 if(goodSeq == 1 && startPos != -1 && startPos < currSeq.getStartPos())                  {       goodSeq = 0;    }
1155                                 if(goodSeq == 1 && endPos != -1 && endPos > currSeq.getEndPos())                                {       goodSeq = 0;    }
1156                                 if(goodSeq == 1 && maxAmbig != -1 && maxAmbig < currSeq.getAmbigBases())                {       goodSeq = 0;    }
1157                                 if(goodSeq == 1 && maxHomoP != -1 && maxHomoP < currSeq.getLongHomoPolymer())   {       goodSeq = 0;    }
1158                                 if(goodSeq == 1 && minLength != -1 && minLength > currSeq.getNumBases())                {       goodSeq = 0;    }
1159                                 if(goodSeq == 1 && maxLength != -1 && maxLength < currSeq.getNumBases())                {       goodSeq = 0;    }
1160                                 
1161                                 if(goodSeq == 1){
1162                                         currSeq.printSequence(goodFile);        
1163                                 }
1164                                 else{
1165                                         badAccnosFile << currSeq.getName() << endl;
1166                                         badSeqNames.insert(currSeq.getName());
1167                                 }
1168                         count++;
1169                         }
1170                         
1171                         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
1172                                 unsigned long long pos = inFASTA.tellg();
1173                                 if ((pos == -1) || (pos >= filePos->end)) { break; }
1174                         #else
1175                                 if (inFASTA.eof()) { break; }
1176                         #endif
1177                         
1178                         //report progress
1179                         if((count) % 100 == 0){ m->mothurOut("Processing sequence: " + toString(count)); m->mothurOutEndLine();         }
1180                 }
1181                 //report progress
1182                 if((count) % 100 != 0){ m->mothurOut("Processing sequence: " + toString(count)); m->mothurOutEndLine();         }
1183                 
1184                         
1185                 goodFile.close();
1186                 inFASTA.close();
1187                 badAccnosFile.close();
1188                 
1189                 return count;
1190         }
1191         catch(exception& e) {
1192                 m->errorOut(e, "ScreenSeqsCommand", "driver");
1193                 exit(1);
1194         }
1195 }
1196 //**********************************************************************************************************************
1197 #ifdef USE_MPI
1198 int ScreenSeqsCommand::driverMPI(int start, int num, MPI_File& inMPI, MPI_File& goodFile, MPI_File& badAccnosFile, vector<unsigned long long>& MPIPos, set<string>& badSeqNames){
1199         try {
1200                 string outputString = "";
1201                 MPI_Status statusGood; 
1202                 MPI_Status statusBadAccnos; 
1203                 MPI_Status status; 
1204                 int pid;
1205                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
1206
1207                 for(int i=0;i<num;i++){
1208                 
1209                         if (m->control_pressed) {  return 0; }
1210                         
1211                         //read next sequence
1212                         int length = MPIPos[start+i+1] - MPIPos[start+i];
1213
1214                         char* buf4 = new char[length];
1215                         memcpy(buf4, outputString.c_str(), length);
1216
1217                         MPI_File_read_at(inMPI, MPIPos[start+i], buf4, length, MPI_CHAR, &status);
1218                         
1219                         string tempBuf = buf4;  delete buf4;
1220                         if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length);  }
1221                         istringstream iss (tempBuf,istringstream::in);
1222                         
1223                         Sequence currSeq(iss);                  
1224                         
1225                         //process seq
1226                         if (currSeq.getName() != "") {
1227                                 bool goodSeq = 1;               //      innocent until proven guilty
1228                                 if(goodSeq == 1 && startPos != -1 && startPos < currSeq.getStartPos())                  {       goodSeq = 0;    }
1229                                 if(goodSeq == 1 && endPos != -1 && endPos > currSeq.getEndPos())                                {       goodSeq = 0;    }
1230                                 if(goodSeq == 1 && maxAmbig != -1 && maxAmbig < currSeq.getAmbigBases())                {       goodSeq = 0;    }
1231                                 if(goodSeq == 1 && maxHomoP != -1 && maxHomoP < currSeq.getLongHomoPolymer())   {       goodSeq = 0;    }
1232                                 if(goodSeq == 1 && minLength != -1 && minLength > currSeq.getNumBases())                {       goodSeq = 0;    }
1233                                 if(goodSeq == 1 && maxLength != -1 && maxLength < currSeq.getNumBases())                {       goodSeq = 0;    }
1234                                 
1235                                 if(goodSeq == 1){
1236                                         outputString =  ">" + currSeq.getName() + "\n" + currSeq.getAligned() + "\n";
1237                                 
1238                                         //print good seq
1239                                         length = outputString.length();
1240                                         char* buf2 = new char[length];
1241                                         memcpy(buf2, outputString.c_str(), length);
1242                                         
1243                                         MPI_File_write_shared(goodFile, buf2, length, MPI_CHAR, &statusGood);
1244                                         delete buf2;
1245                                 }
1246                                 else{
1247
1248                                         badSeqNames.insert(currSeq.getName());
1249                                         
1250                                         //write to bad accnos file
1251                                         outputString = currSeq.getName() + "\n";
1252                                 
1253                                         length = outputString.length();
1254                                         char* buf3 = new char[length];
1255                                         memcpy(buf3, outputString.c_str(), length);
1256                                         
1257                                         MPI_File_write_shared(badAccnosFile, buf3, length, MPI_CHAR, &statusBadAccnos);
1258                                         delete buf3;
1259                                 }
1260                         }
1261                         
1262                         //report progress
1263                         if((i) % 100 == 0){     m->mothurOut("Processing sequence: " + toString(i)); m->mothurOutEndLine();             }
1264                 }
1265                                 
1266                 return 1;
1267         }
1268         catch(exception& e) {
1269                 m->errorOut(e, "ScreenSeqsCommand", "driverMPI");
1270                 exit(1);
1271         }
1272 }
1273 #endif
1274 /**************************************************************************************************/
1275
1276 int ScreenSeqsCommand::createProcesses(string goodFileName, string badAccnos, string filename, set<string>& badSeqNames) {
1277         try {
1278 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
1279                 int process = 0;
1280                 int num = 0;
1281                 
1282                 //loop through and create all the processes you want
1283                 while (process != processors) {
1284                         int pid = fork();
1285                         
1286                         if (pid > 0) {
1287                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
1288                                 process++;
1289                         }else if (pid == 0){
1290                                 num = driver(lines[process], goodFileName + toString(getpid()) + ".temp", badAccnos + toString(getpid()) + ".temp", filename, badSeqNames);
1291                                 
1292                                 //pass numSeqs to parent
1293                                 ofstream out;
1294                                 string tempFile = filename + toString(getpid()) + ".num.temp";
1295                                 m->openOutputFile(tempFile, out);
1296                                 out << num << endl;
1297                                 out.close();
1298                                 
1299                                 exit(0);
1300                         }else { 
1301                                 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine(); 
1302                                 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
1303                                 exit(0);
1304                         }
1305                 }
1306                 
1307                 //force parent to wait until all the processes are done
1308                 for (int i=0;i<processors;i++) { 
1309                         int temp = processIDS[i];
1310                         wait(&temp);
1311                 }
1312                 
1313                 for (int i = 0; i < processIDS.size(); i++) {
1314                         ifstream in;
1315                         string tempFile =  filename + toString(processIDS[i]) + ".num.temp";
1316                         m->openInputFile(tempFile, in);
1317                         if (!in.eof()) { int tempNum = 0; in >> tempNum; num += tempNum; }
1318                         in.close(); m->mothurRemove(tempFile);
1319                 }
1320                 
1321                 return num;
1322 #endif          
1323         }
1324         catch(exception& e) {
1325                 m->errorOut(e, "ScreenSeqsCommand", "createProcesses");
1326                 exit(1);
1327         }
1328 }
1329
1330 //***************************************************************************************************************
1331
1332