]> git.donarmstrong.com Git - mothur.git/blob - screenseqscommand.cpp
sffinfo bug with flow grams right index when clipQualRight=0
[mothur.git] / screenseqscommand.cpp
1 /*
2  *  screenseqscommand.cpp
3  *  Mothur
4  *
5  *  Created by Pat Schloss on 6/3/09.
6  *  Copyright 2009 Patrick D. Schloss. All rights reserved.
7  *
8  */
9
10 #include "screenseqscommand.h"
11 #include "counttable.h"
12
13 //**********************************************************************************************************************
14 vector<string> ScreenSeqsCommand::setParameters(){      
15         try {
16                 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none","fasta",false,true,true); parameters.push_back(pfasta);
17         CommandParameter pcontigsreport("contigsreport", "InputTypes", "", "", "report", "none", "none","contigsreport",false,true,true); parameters.push_back(pcontigsreport);
18         CommandParameter palignreport("alignreport", "InputTypes", "", "", "report", "none", "none","alignreport",false,false); parameters.push_back(palignreport);
19         CommandParameter psummary("summary", "InputTypes", "", "", "report", "none", "none","summary",false,false); parameters.push_back(psummary);
20         CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "none","name",false,false,true); parameters.push_back(pname);
21         CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "none", "none","count",false,false,true); parameters.push_back(pcount);
22                 CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "none", "none","group",false,false,true); parameters.push_back(pgroup);
23                 CommandParameter pqfile("qfile", "InputTypes", "", "", "none", "none", "none","qfile",false,false); parameters.push_back(pqfile);
24                 
25                 CommandParameter ptax("taxonomy", "InputTypes", "", "", "none", "none", "none","taxonomy",false,false); parameters.push_back(ptax);
26                 CommandParameter pstart("start", "Number", "", "-1", "", "", "","",false,false,true); parameters.push_back(pstart);
27                 CommandParameter pend("end", "Number", "", "-1", "", "", "","",false,false,true); parameters.push_back(pend);
28                 CommandParameter pmaxambig("maxambig", "Number", "", "-1", "", "", "","",false,false); parameters.push_back(pmaxambig);
29                 CommandParameter pmaxhomop("maxhomop", "Number", "", "-1", "", "", "","",false,false); parameters.push_back(pmaxhomop);
30                 CommandParameter pminlength("minlength", "Number", "", "-1", "", "", "","",false,false); parameters.push_back(pminlength);
31                 CommandParameter pmaxlength("maxlength", "Number", "", "-1", "", "", "","",false,false); parameters.push_back(pmaxlength);
32                 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "","",false,false,true); parameters.push_back(pprocessors);
33                 CommandParameter pcriteria("criteria", "Number", "", "90", "", "", "","",false,false); parameters.push_back(pcriteria);
34                 CommandParameter poptimize("optimize", "Multiple", "none-start-end-maxambig-maxhomop-minlength-maxlength", "none", "", "", "","",true,false); parameters.push_back(poptimize);
35         CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir);
36                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir);
37         
38         //report parameters
39         CommandParameter pminoverlap("minoverlap", "Number", "", "-1", "", "", "","",false,false); parameters.push_back(pminoverlap);
40         CommandParameter postart("ostart", "Number", "", "-1", "", "", "","",false,false); parameters.push_back(postart);
41         CommandParameter poend("oend", "Number", "", "-1", "", "", "","",false,false); parameters.push_back(poend);
42         CommandParameter pmismatches("mismatches", "Number", "", "-1", "", "", "","",false,false); parameters.push_back(pmismatches);
43         CommandParameter pmaxn("maxn", "Number", "", "-1", "", "", "","",false,false); parameters.push_back(pmaxn);
44         CommandParameter pminscore("minscore", "Number", "", "-1", "", "", "","",false,false); parameters.push_back(pminscore);
45         CommandParameter pmaxinsert("maxinsert", "Number", "", "-1", "", "", "","",false,false); parameters.push_back(pmaxinsert);
46         CommandParameter pminsim("minsim", "Number", "", "-1", "", "", "","",false,false); parameters.push_back(pminsim);
47
48                 
49                 
50                 vector<string> myArray;
51                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
52                 return myArray;
53         }
54         catch(exception& e) {
55                 m->errorOut(e, "ScreenSeqsCommand", "setParameters");
56                 exit(1);
57         }
58 }
59 //**********************************************************************************************************************
60 string ScreenSeqsCommand::getHelpString(){      
61         try {
62                 string helpString = "";
63                 helpString += "The screen.seqs command reads a fastafile and screens sequences.\n";
64                 helpString += "The screen.seqs command parameters are fasta, start, end, maxambig, maxhomop, minlength, maxlength, name, group, count, qfile, alignreport, contigsreport, summary, taxonomy, optimize, criteria and processors.\n";
65                 helpString += "The fasta parameter is required.\n";
66         helpString += "The contigsreport parameter allows you to use the contigsreport file to determine if a sequence is good. Screening parameters include: minoverlap, ostart, oend and mismatches. \n";
67         helpString += "The alignreport parameter allows you to use the alignreport file to determine if a sequence is good. Screening parameters include: minsim, minscore and maxinsert. \n";
68         helpString += "The summary parameter allows you to use the summary file from summary.seqs to save time processing.\n";
69                 helpString += "The taxonomy parameter allows you to remove bad seqs from taxonomy files.\n";
70                 helpString += "The start parameter is used to set a position the \"good\" sequences must start by. The default is -1.\n";
71                 helpString += "The end parameter is used to set a position the \"good\" sequences must end after. The default is -1.\n";
72                 helpString += "The maxambig parameter allows you to set the maximum number of ambigious bases allowed. The default is -1.\n";
73                 helpString += "The maxhomop parameter allows you to set a maximum homopolymer length. \n";
74                 helpString += "The minlength parameter allows you to set and minimum sequence length. \n";
75                 helpString += "The maxn parameter allows you to set and maximum number of N's allowed in a sequence. \n";
76         helpString += "The minoverlap parameter allows you to set and minimum overlap. The default is -1. \n";
77         helpString += "The ostart parameter is used to set an overlap position the \"good\" sequences must start by. The default is -1. \n";
78         helpString += "The oend parameter is used to set an overlap position the \"good\" sequences must end after. The default is -1.\n";
79         helpString += "The mismatches parameter allows you to set and maximum mismatches in the contigs.report. \n";
80         helpString += "The minsim parameter allows you to set the minimum similarity to template sequences during alignment. Found in column \'SimBtwnQuery&Template\' in align.report file.\n";
81         helpString += "The minscore parameter allows you to set the minimum search score during alignment. Found in column \'SearchScore\' in align.report file.\n";
82         helpString += "The maxinsert parameter allows you to set the maximum number of insertions during alignment. Found in column \'LongestInsert\' in align.report file.\n";
83                 helpString += "The processors parameter allows you to specify the number of processors to use while running the command. The default is 1.\n";
84                 helpString += "The optimize and criteria parameters allow you set the start, end, maxabig, maxhomop, minlength and maxlength parameters relative to your set of sequences .\n";
85                 helpString += "For example optimize=start-end, criteria=90, would set the start and end values to the position 90% of your sequences started and ended.\n";
86                 helpString += "The name parameter allows you to provide a namesfile, and the group parameter allows you to provide a groupfile.\n";
87                 helpString += "The screen.seqs command should be in the following format: \n";
88                 helpString += "screen.seqs(fasta=yourFastaFile, name=youNameFile, group=yourGroupFIle, start=yourStart, end=yourEnd, maxambig=yourMaxambig,  \n";
89                 helpString += "maxhomop=yourMaxhomop, minlength=youMinlength, maxlength=yourMaxlength)  \n";    
90                 helpString += "Example screen.seqs(fasta=abrecovery.fasta, name=abrecovery.names, group=abrecovery.groups, start=..., end=..., maxambig=..., maxhomop=..., minlength=..., maxlength=...).\n";
91                 helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n";
92                 return helpString;
93         }
94         catch(exception& e) {
95                 m->errorOut(e, "ScreenSeqsCommand", "getHelpString");
96                 exit(1);
97         }
98 }
99 //**********************************************************************************************************************
100 string ScreenSeqsCommand::getOutputPattern(string type) {
101     try {
102         string pattern = "";
103         
104         if (type == "fasta")            {   pattern = "[filename],good,[extension]";    }
105         else if (type == "taxonomy")    {   pattern = "[filename],good,[extension]";    }
106         else if (type == "name")        {   pattern = "[filename],good,[extension]";    }
107         else if (type == "group")       {   pattern = "[filename],good,[extension]";    }
108         else if (type == "count")       {   pattern = "[filename],good,[extension]";    }
109         else if (type == "accnos")      {   pattern = "[filename],bad.accnos";          }
110         else if (type == "qfile")       {   pattern = "[filename],good,[extension]";    }
111         else if (type == "alignreport")      {   pattern = "[filename],good.align.report";    }
112         else if (type == "contigsreport")      {   pattern = "[filename],good.contigs.report";    }
113         else if (type == "summary")      {   pattern = "[filename],good.summary";    }
114         else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true;  }
115         
116         return pattern;
117     }
118     catch(exception& e) {
119         m->errorOut(e, "ScreenSeqsCommand", "getOutputPattern");
120         exit(1);
121     }
122 }
123 //**********************************************************************************************************************
124 ScreenSeqsCommand::ScreenSeqsCommand(){ 
125         try {
126                 abort = true; calledHelp = true; 
127                 setParameters();
128                 vector<string> tempOutNames;
129                 outputTypes["fasta"] = tempOutNames;
130                 outputTypes["name"] = tempOutNames;
131                 outputTypes["group"] = tempOutNames;
132                 outputTypes["alignreport"] = tempOutNames;
133         outputTypes["contigsreport"] = tempOutNames;
134         outputTypes["summary"] = tempOutNames;
135                 outputTypes["accnos"] = tempOutNames;
136                 outputTypes["qfile"] = tempOutNames;
137                 outputTypes["taxonomy"] = tempOutNames;
138         outputTypes["count"] = tempOutNames;
139         }
140         catch(exception& e) {
141                 m->errorOut(e, "ScreenSeqsCommand", "ScreenSeqsCommand");
142                 exit(1);
143         }
144 }
145 //***************************************************************************************************************
146
147 ScreenSeqsCommand::ScreenSeqsCommand(string option)  {
148         try {
149                 abort = false; calledHelp = false;   
150                 
151                 //allow user to run help
152                 if(option == "help") { help(); abort = true; calledHelp = true; }
153                 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
154                 
155                 else {
156                         vector<string> myArray = setParameters();
157                         
158                         OptionParser parser(option);
159                         map<string,string> parameters = parser.getParameters();
160                         
161                         ValidParameters validParameter("screen.seqs");
162                         map<string,string>::iterator it;
163                         
164                         //check to make sure all parameters are valid for command
165                         for (it = parameters.begin(); it != parameters.end(); it++) { 
166                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
167                         }
168                         
169                         //initialize outputTypes
170                         vector<string> tempOutNames;
171                         outputTypes["fasta"] = tempOutNames;
172                         outputTypes["name"] = tempOutNames;
173                         outputTypes["group"] = tempOutNames;
174                         outputTypes["alignreport"] = tempOutNames;
175                         outputTypes["accnos"] = tempOutNames;
176                         outputTypes["qfile"] = tempOutNames;
177                         outputTypes["taxonomy"] = tempOutNames;
178             outputTypes["count"] = tempOutNames;
179                         outputTypes["contigsreport"] = tempOutNames;
180             outputTypes["summary"] = tempOutNames;
181
182             
183                         //if the user changes the input directory command factory will send this info to us in the output parameter 
184                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
185                         if (inputDir == "not found"){   inputDir = "";          }
186                         else {
187                                 string path;
188                                 it = parameters.find("fasta");
189                                 //user has given a template file
190                                 if(it != parameters.end()){ 
191                                         path = m->hasPath(it->second);
192                                         //if the user has not given a path then, add inputdir. else leave path alone.
193                                         if (path == "") {       parameters["fasta"] = inputDir + it->second;            }
194                                 }
195                                 
196                                 it = parameters.find("group");
197                                 //user has given a template file
198                                 if(it != parameters.end()){ 
199                                         path = m->hasPath(it->second);
200                                         //if the user has not given a path then, add inputdir. else leave path alone.
201                                         if (path == "") {       parameters["group"] = inputDir + it->second;            }
202                                 }
203                                 
204                                 it = parameters.find("name");
205                                 //user has given a template file
206                                 if(it != parameters.end()){ 
207                                         path = m->hasPath(it->second);
208                                         //if the user has not given a path then, add inputdir. else leave path alone.
209                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
210                                 }
211                                 
212                                 it = parameters.find("alignreport");
213                                 //user has given a template file
214                                 if(it != parameters.end()){ 
215                                         path = m->hasPath(it->second);
216                                         //if the user has not given a path then, add inputdir. else leave path alone.
217                                         if (path == "") {       parameters["alignreport"] = inputDir + it->second;              }
218                                 }
219                 
220                 it = parameters.find("contigsreport");
221                                 //user has given a template file
222                                 if(it != parameters.end()){ 
223                                         path = m->hasPath(it->second);
224                                         //if the user has not given a path then, add inputdir. else leave path alone.
225                                         if (path == "") {       parameters["contigsreport"] = inputDir + it->second;            }
226                                 }
227                 
228                 it = parameters.find("summary");
229                                 //user has given a template file
230                                 if(it != parameters.end()){ 
231                                         path = m->hasPath(it->second);
232                                         //if the user has not given a path then, add inputdir. else leave path alone.
233                                         if (path == "") {       parameters["summary"] = inputDir + it->second;          }
234                                 }
235                                 
236                                 it = parameters.find("qfile");
237                                 //user has given a template file
238                                 if(it != parameters.end()){ 
239                                         path = m->hasPath(it->second);
240                                         //if the user has not given a path then, add inputdir. else leave path alone.
241                                         if (path == "") {       parameters["qfile"] = inputDir + it->second;            }
242                                 }
243                                 
244                                 it = parameters.find("taxonomy");
245                                 //user has given a template file
246                                 if(it != parameters.end()){ 
247                                         path = m->hasPath(it->second);
248                                         //if the user has not given a path then, add inputdir. else leave path alone.
249                                         if (path == "") {       parameters["taxonomy"] = inputDir + it->second;         }
250                                 }
251                 
252                 it = parameters.find("count");
253                                 //user has given a template file
254                                 if(it != parameters.end()){ 
255                                         path = m->hasPath(it->second);
256                                         //if the user has not given a path then, add inputdir. else leave path alone.
257                                         if (path == "") {       parameters["count"] = inputDir + it->second;            }
258                                 }
259                         }
260
261                         //check for required parameters
262                         fastafile = validParameter.validFile(parameters, "fasta", true);
263                         if (fastafile == "not found") {                         
264                                 fastafile = m->getFastaFile(); 
265                                 if (fastafile != "") { m->mothurOut("Using " + fastafile + " as input file for the fasta parameter."); m->mothurOutEndLine(); }
266                                 else {  m->mothurOut("You have no current fastafile and the fasta parameter is required."); m->mothurOutEndLine(); abort = true; }
267                         }
268                         else if (fastafile == "not open") { abort = true; }
269                         else { m->setFastaFile(fastafile); }
270         
271                         groupfile = validParameter.validFile(parameters, "group", true);
272                         if (groupfile == "not open") { abort = true; }  
273                         else if (groupfile == "not found") { groupfile = ""; }
274                         else { m->setGroupFile(groupfile); }
275                         
276                         qualfile = validParameter.validFile(parameters, "qfile", true);
277                         if (qualfile == "not open") { abort = true; }   
278                         else if (qualfile == "not found") { qualfile = ""; }
279                         else { m->setQualFile(qualfile); }
280                         
281                         namefile = validParameter.validFile(parameters, "name", true);
282                         if (namefile == "not open") { namefile = ""; abort = true; }
283                         else if (namefile == "not found") { namefile = ""; }    
284                         else { m->setNameFile(namefile); }
285                         
286             countfile = validParameter.validFile(parameters, "count", true);
287                         if (countfile == "not open") { countfile = ""; abort = true; }
288                         else if (countfile == "not found") { countfile = "";  } 
289                         else { m->setCountTableFile(countfile); }
290             
291             contigsreport = validParameter.validFile(parameters, "contigsreport", true);
292                         if (contigsreport == "not open") { contigsreport = ""; abort = true; }
293                         else if (contigsreport == "not found") { contigsreport = "";  } 
294             
295             summaryfile = validParameter.validFile(parameters, "summary", true);
296                         if (summaryfile == "not open") { summaryfile = ""; abort = true; }
297                         else if (summaryfile == "not found") { summaryfile = "";  }     
298             
299             if ((namefile != "") && (countfile != "")) {
300                 m->mothurOut("[ERROR]: you may only use one of the following: name or count."); m->mothurOutEndLine(); abort = true;
301             }
302                         
303             if ((groupfile != "") && (countfile != "")) {
304                 m->mothurOut("[ERROR]: you may only use one of the following: group or count."); m->mothurOutEndLine(); abort=true;
305             }
306             
307                         alignreport = validParameter.validFile(parameters, "alignreport", true);
308                         if (alignreport == "not open") { abort = true; }
309                         else if (alignreport == "not found") { alignreport = ""; }
310                         
311                         taxonomy = validParameter.validFile(parameters, "taxonomy", true);
312                         if (taxonomy == "not open") { abort = true; }
313                         else if (taxonomy == "not found") { taxonomy = ""; }    
314                         
315                         //if the user changes the output directory command factory will send this info to us in the output parameter 
316                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  
317                                 outputDir = ""; 
318                                 outputDir += m->hasPath(fastafile); //if user entered a file with a path then preserve it       
319                         }
320
321                         //check for optional parameter and set defaults
322                         // ...at some point should added some additional type checking...
323                         string temp;
324                         temp = validParameter.validFile(parameters, "start", false);            if (temp == "not found") { temp = "-1"; }
325                         m->mothurConvert(temp, startPos); 
326                 
327                         temp = validParameter.validFile(parameters, "end", false);                      if (temp == "not found") { temp = "-1"; }
328                         m->mothurConvert(temp, endPos);  
329
330                         temp = validParameter.validFile(parameters, "maxambig", false);         if (temp == "not found") { temp = "-1"; }
331                         m->mothurConvert(temp, maxAmbig);  
332
333                         temp = validParameter.validFile(parameters, "maxhomop", false);         if (temp == "not found") { temp = "-1"; }
334                         m->mothurConvert(temp, maxHomoP);  
335
336                         temp = validParameter.validFile(parameters, "minlength", false);        if (temp == "not found") { temp = "-1"; }
337                         m->mothurConvert(temp, minLength); 
338                         
339                         temp = validParameter.validFile(parameters, "maxlength", false);        if (temp == "not found") { temp = "-1"; }
340                         m->mothurConvert(temp, maxLength); 
341                         
342                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found"){       temp = m->getProcessors();      }
343                         m->setProcessors(temp);
344                         m->mothurConvert(temp, processors);
345                         
346             temp = validParameter.validFile(parameters, "minoverlap", false);   if (temp == "not found") { temp = "-1"; }
347                         m->mothurConvert(temp, minOverlap); 
348             
349             temp = validParameter.validFile(parameters, "ostart", false);       if (temp == "not found") { temp = "-1"; }
350                         m->mothurConvert(temp, oStart); 
351             
352             temp = validParameter.validFile(parameters, "oend", false); if (temp == "not found") { temp = "-1"; }
353                         m->mothurConvert(temp, oEnd); 
354             
355             temp = validParameter.validFile(parameters, "mismatches", false);   if (temp == "not found") { temp = "-1"; }
356                         m->mothurConvert(temp, mismatches); 
357             
358             temp = validParameter.validFile(parameters, "maxn", false); if (temp == "not found") { temp = "-1"; }
359                         m->mothurConvert(temp, maxN); 
360             
361             temp = validParameter.validFile(parameters, "minscore", false);     if (temp == "not found") { temp = "-1"; }
362                         m->mothurConvert(temp, minScore); 
363             
364             temp = validParameter.validFile(parameters, "maxinsert", false);    if (temp == "not found") { temp = "-1"; }
365                         m->mothurConvert(temp, maxInsert); 
366             
367             temp = validParameter.validFile(parameters, "minsim", false);       if (temp == "not found") { temp = "-1"; }
368                         m->mothurConvert(temp, minSim); 
369             
370                         temp = validParameter.validFile(parameters, "optimize", false); //optimizing trumps the optimized values original value
371                         if (temp == "not found"){       temp = "none";          }
372                         m->splitAtDash(temp, optimize);         
373             
374             if ((contigsreport != "") && ((summaryfile != "") || ( alignreport != ""))) {
375                 m->mothurOut("[ERROR]: You may only provide one of the following: contigsreport, alignreport or summary, aborting.\n"); abort=true;
376             }
377             
378             if ((alignreport != "") && ((summaryfile != "") || ( contigsreport != ""))) {
379                 m->mothurOut("[ERROR]: You may only provide one of the following: contigsreport, alignreport or summary, aborting.\n"); abort=true;
380             }
381             
382             if ((summaryfile != "") && ((alignreport != "") || ( contigsreport != ""))) {
383                 m->mothurOut("[ERROR]: You may only provide one of the following: contigsreport, alignreport or summary, aborting.\n"); abort=true;
384             }
385                         
386             //check to make sure you have the files you need for certain screening
387             if ((contigsreport == "") && ((minOverlap != -1) || (oStart != -1) || (oEnd != -1) || (mismatches != -1))) {
388                 m->mothurOut("[ERROR]: minoverlap, ostart, oend and mismatches can only be used with a contigs.report file, aborting.\n"); abort=true;
389             }
390             
391             if ((alignreport == "") && ((minScore != -1) || (maxInsert != -1) || (minSim != -1))) {
392                 m->mothurOut("[ERROR]: minscore, maxinsert and minsim can only be used with a align.report file, aborting.\n"); abort=true;
393             }
394             
395                         //check for invalid optimize options
396                         set<string> validOptimizers;
397                         validOptimizers.insert("none"); validOptimizers.insert("start"); validOptimizers.insert("end"); validOptimizers.insert("maxambig"); validOptimizers.insert("maxhomop"); validOptimizers.insert("minlength"); validOptimizers.insert("maxlength"); validOptimizers.insert("maxn");
398             if (contigsreport != "")    { validOptimizers.insert("minoverlap"); validOptimizers.insert("ostart"); validOptimizers.insert("oend"); validOptimizers.insert("mismatches");  }
399             if (alignreport != "")      { validOptimizers.insert("minscore"); validOptimizers.insert("maxinsert"); validOptimizers.insert("minsim"); }
400             
401                         for (int i = 0; i < optimize.size(); i++) { 
402                                 if (validOptimizers.count(optimize[i]) == 0) { 
403                                         m->mothurOut(optimize[i] + " is not a valid optimizer with your input files. Valid options are "); 
404                     string valid = "";
405                     for (set<string>::iterator it = validOptimizers.begin(); it != validOptimizers.end(); it++) {
406                         valid += (*it) + ", ";
407                     }
408                     if (valid.length() != 0) {  valid = valid.substr(0, valid.length()-2); }
409                     m->mothurOut(valid + ".");
410                     m->mothurOutEndLine();
411                                         optimize.erase(optimize.begin()+i);
412                                         i--;
413                                 }
414                         }
415                         
416                         if (optimize.size() == 1) { if (optimize[0] == "none") { optimize.clear(); } }
417                         
418                         temp = validParameter.validFile(parameters, "criteria", false); if (temp == "not found"){       temp = "90";                            }
419                         m->mothurConvert(temp, criteria); 
420                         
421                         if (countfile == "") { 
422                 if (namefile == "") {
423                     vector<string> files; files.push_back(fastafile);
424                     parser.getNameFile(files);
425                 }
426             }
427                 }
428
429         }
430         catch(exception& e) {
431                 m->errorOut(e, "ScreenSeqsCommand", "ScreenSeqsCommand");
432                 exit(1);
433         }
434 }
435 //***************************************************************************************************************
436
437 int ScreenSeqsCommand::execute(){
438         try{
439                 
440                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
441                 
442         map<string, string> badSeqNames;
443         int start = time(NULL);
444         int numFastaSeqs = 0;
445         
446         if ((contigsreport == "") && (summaryfile == "") && (alignreport == "")) {   numFastaSeqs = screenFasta(badSeqNames);  }
447         else {   numFastaSeqs = screenReports(badSeqNames);   }
448                 
449         if (m->control_pressed) {  for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
450         
451         #ifdef USE_MPI
452             int pid;
453             MPI_Comm_rank(MPI_COMM_WORLD, &pid); 
454         
455             if (pid == 0) { //only one process should fix files
456         #endif  
457                 
458                 if(namefile != "" && groupfile != "")   {       
459                         screenNameGroupFile(badSeqNames);       
460                         if (m->control_pressed) {  for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]);  } return 0; }
461                 }else if(namefile != "")        {       
462                         screenNameGroupFile(badSeqNames);
463                         if (m->control_pressed) {  for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]);  } return 0; }       
464                 }else if(groupfile != "")                               {       screenGroupFile(badSeqNames);           }       // this screens just the group
465                 else if (countfile != "") {     screenCountFile(badSeqNames);           }
466             
467                 
468                 if (m->control_pressed) {  for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]);  } return 0; }
469
470                 if(qualfile != "")                                              {       screenQual(badSeqNames);                        }
471                 if(taxonomy != "")                                              {       screenTaxonomy(badSeqNames);            }
472                 
473                 if (m->control_pressed) {  for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]);  } return 0; }
474                 
475                 #ifdef USE_MPI
476                         }
477                 #endif
478
479                 m->mothurOutEndLine();
480                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
481                 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
482                 m->mothurOutEndLine();
483                 m->mothurOutEndLine();
484                 
485                 //set fasta file as new current fastafile
486                 string current = "";
487                 itTypes = outputTypes.find("fasta");
488                 if (itTypes != outputTypes.end()) {
489                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFastaFile(current); }
490                 }
491                 
492                 itTypes = outputTypes.find("name");
493                 if (itTypes != outputTypes.end()) {
494                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setNameFile(current); }
495                 }
496                 
497                 itTypes = outputTypes.find("group");
498                 if (itTypes != outputTypes.end()) {
499                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setGroupFile(current); }
500                 }
501                 
502                 itTypes = outputTypes.find("qfile");
503                 if (itTypes != outputTypes.end()) {
504                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setQualFile(current); }
505                 }
506                 
507                 itTypes = outputTypes.find("taxonomy");
508                 if (itTypes != outputTypes.end()) {
509                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setTaxonomyFile(current); }
510                 }
511         
512         itTypes = outputTypes.find("count");
513                 if (itTypes != outputTypes.end()) {
514                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setCountTableFile(current); }
515                 }
516
517                 m->mothurOut("It took " + toString(time(NULL) - start) + " secs to screen " + toString(numFastaSeqs) + " sequences.");
518                 m->mothurOutEndLine();
519
520                 return 0;
521         }
522         catch(exception& e) {
523                 m->errorOut(e, "ScreenSeqsCommand", "execute");
524                 exit(1);
525         }
526 }
527 //***************************************************************************************************************/
528 int ScreenSeqsCommand::runFastaScreening(map<string, string>& badSeqNames){
529         try{
530         int numFastaSeqs = 0;
531         map<string, string> variables; 
532         variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(fastafile));
533         string badAccnosFile =  getOutputFileName("accnos",variables);
534         variables["[extension]"] = m->getExtension(fastafile);
535                 string goodSeqFile = getOutputFileName("fasta", variables);
536                 outputNames.push_back(goodSeqFile); outputTypes["fasta"].push_back(goodSeqFile);
537                 outputNames.push_back(badAccnosFile); outputTypes["accnos"].push_back(badAccnosFile);
538         
539 #ifdef USE_MPI  
540         int pid, numSeqsPerProcessor; 
541         int tag = 2001;
542         vector<unsigned long long> MPIPos;
543         
544         MPI_Status status; 
545         MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
546         MPI_Comm_size(MPI_COMM_WORLD, &processors); 
547         
548         MPI_File inMPI;
549         MPI_File outMPIGood;
550         MPI_File outMPIBadAccnos;
551         
552         int outMode=MPI_MODE_CREATE|MPI_MODE_WRONLY; 
553         int inMode=MPI_MODE_RDONLY; 
554         
555         char outGoodFilename[1024];
556         strcpy(outGoodFilename, goodSeqFile.c_str());
557         
558         char outBadAccnosFilename[1024];
559         strcpy(outBadAccnosFilename, badAccnosFile.c_str());
560         
561         char inFileName[1024];
562         strcpy(inFileName, fastafile.c_str());
563         
564         MPI_File_open(MPI_COMM_WORLD, inFileName, inMode, MPI_INFO_NULL, &inMPI);  //comm, filename, mode, info, filepointer
565         MPI_File_open(MPI_COMM_WORLD, outGoodFilename, outMode, MPI_INFO_NULL, &outMPIGood);
566         MPI_File_open(MPI_COMM_WORLD, outBadAccnosFilename, outMode, MPI_INFO_NULL, &outMPIBadAccnos);
567         
568         if (m->control_pressed) { MPI_File_close(&inMPI);  MPI_File_close(&outMPIGood); MPI_File_close(&outMPIBadAccnos); return 0; }
569         
570         if (pid == 0) { //you are the root process 
571             
572             MPIPos = m->setFilePosFasta(fastafile, numFastaSeqs); //fills MPIPos, returns numSeqs
573             
574             //send file positions to all processes
575             for(int i = 1; i < processors; i++) { 
576                 MPI_Send(&numFastaSeqs, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
577                 MPI_Send(&MPIPos[0], (numFastaSeqs+1), MPI_LONG, i, tag, MPI_COMM_WORLD);
578             }
579             
580             //figure out how many sequences you have to align
581             numSeqsPerProcessor = numFastaSeqs / processors;
582             int startIndex =  pid * numSeqsPerProcessor;
583             if(pid == (processors - 1)){        numSeqsPerProcessor = numFastaSeqs - pid * numSeqsPerProcessor;         }
584             
585             //align your part
586             driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPIGood, outMPIBadAccnos, MPIPos, badSeqNames);
587             
588             if (m->control_pressed) { MPI_File_close(&inMPI);  MPI_File_close(&outMPIGood);  MPI_File_close(&outMPIBadAccnos);  return 0; }
589             
590             for (int i = 1; i < processors; i++) {
591                 //get bad lists
592                 int badSize;
593                 MPI_Recv(&badSize, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
594             }
595         }else{ //you are a child process
596             MPI_Recv(&numFastaSeqs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
597             MPIPos.resize(numFastaSeqs+1);
598             MPI_Recv(&MPIPos[0], (numFastaSeqs+1), MPI_LONG, 0, tag, MPI_COMM_WORLD, &status);
599             
600             //figure out how many sequences you have to align
601             numSeqsPerProcessor = numFastaSeqs / processors;
602             int startIndex =  pid * numSeqsPerProcessor;
603             if(pid == (processors - 1)){        numSeqsPerProcessor = numFastaSeqs - pid * numSeqsPerProcessor;         }
604             
605             //align your part
606             driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPIGood, outMPIBadAccnos, MPIPos, badSeqNames);
607             
608             if (m->control_pressed) { MPI_File_close(&inMPI);  MPI_File_close(&outMPIGood);  MPI_File_close(&outMPIBadAccnos); return 0; }
609             
610             //send bad list     
611             int badSize = badSeqNames.size();
612             MPI_Send(&badSize, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
613         }
614         
615         //close files 
616         MPI_File_close(&inMPI);
617         MPI_File_close(&outMPIGood);
618         MPI_File_close(&outMPIBadAccnos);
619         MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case
620         
621 #else
622         if(processors == 1){ numFastaSeqs = driver(lines[0], goodSeqFile, badAccnosFile, fastafile, badSeqNames);       }       
623         else{ numFastaSeqs = createProcesses(goodSeqFile, badAccnosFile, fastafile, badSeqNames); }
624         
625         if (m->control_pressed) { m->mothurRemove(goodSeqFile); return numFastaSeqs; }
626 #endif          
627         
628 #ifdef USE_MPI
629         MPI_Comm_rank(MPI_COMM_WORLD, &pid); 
630         
631         if (pid == 0) { //only one process should fix files
632                         
633             //read accnos file with all names in it, process 0 just has its names
634             MPI_File inMPIAccnos;
635             MPI_Offset size;
636                         
637             char inFileName[1024];
638             strcpy(inFileName, badAccnosFile.c_str());
639                         
640             MPI_File_open(MPI_COMM_SELF, inFileName, inMode, MPI_INFO_NULL, &inMPIAccnos);  //comm, filename, mode, info, filepointer
641             MPI_File_get_size(inMPIAccnos, &size);
642                         
643             char* buffer = new char[size];
644             MPI_File_read(inMPIAccnos, buffer, size, MPI_CHAR, &status);
645                         
646             string tempBuf = buffer;
647             if (tempBuf.length() > size) { tempBuf = tempBuf.substr(0, size);  }
648             istringstream iss (tempBuf,istringstream::in);
649             
650             delete buffer;
651             MPI_File_close(&inMPIAccnos);
652             
653             badSeqNames.clear();
654             string tempName, trashCode;
655             while (!iss.eof()) {
656                 iss >> tempName >> trashCode; m->gobble(iss);
657                 badSeqNames[tempName] = trashCode;
658             }
659         }
660 #endif
661         
662         
663                 return numFastaSeqs;
664
665         }
666         catch(exception& e) {
667                 m->errorOut(e, "ScreenSeqsCommand", "runFastaScreening");
668                 exit(1);
669         }
670 }
671 //***************************************************************************************************************/
672 int ScreenSeqsCommand::screenReports(map<string, string>& badSeqNames){
673         try{
674         int numFastaSeqs = 0;
675         bool summarizedFasta = false;
676         
677         //did not provide a summary file, but set a parameter that requires summarizing the fasta file
678         //or did provide a summary file, but set maxn parameter so we must summarize the fasta file 
679         vector<unsigned long long> positions;
680         if (((summaryfile == "") && ((m->inUsersGroups("maxambig", optimize)) ||(m->inUsersGroups("maxhomop", optimize)) ||(m->inUsersGroups("maxlength", optimize)) || (m->inUsersGroups("minlength", optimize)) || (m->inUsersGroups("start", optimize)) || (m->inUsersGroups("end", optimize)))) || ((summaryfile != "") && m->inUsersGroups("maxn", optimize))) {  
681             //use the namefile to optimize correctly
682             if (namefile != "") { nameMap = m->readNames(namefile); }
683             else if (countfile != "") {
684                 CountTable ct;
685                 ct.readTable(countfile);
686                 nameMap = ct.getNameMap();
687             }
688             getSummary(positions); 
689             summarizedFasta = true;
690         } else {
691             #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
692                 positions = m->divideFile(fastafile, processors);
693                 for (int i = 0; i < (positions.size()-1); i++) { lines.push_back(linePair(positions[i], positions[(i+1)])); }
694             #else 
695                 if(processors == 1){ lines.push_back(linePair(0, 1000));  }
696                 else {
697                     int numFastaSeqs = 0;
698                     positions = m->setFilePosFasta(fastafile, numFastaSeqs); 
699                     if (positions.size() < processors) { processors = positions.size(); }
700                 
701                     //figure out how many sequences you have to process
702                     int numSeqsPerProcessor = numFastaSeqs / processors;
703                     for (int i = 0; i < processors; i++) {
704                         int startIndex =  i * numSeqsPerProcessor;
705                         if(i == (processors - 1)){      numSeqsPerProcessor = numFastaSeqs - i * numSeqsPerProcessor;   }
706                         lines.push_back(linePair(positions[startIndex], numSeqsPerProcessor));
707                     }
708                 }
709             #endif
710         }
711         
712         if ((summaryfile != "") && ((m->inUsersGroups("maxambig", optimize)) ||(m->inUsersGroups("maxhomop", optimize)) ||(m->inUsersGroups("maxlength", optimize)) || (m->inUsersGroups("minlength", optimize)) || (m->inUsersGroups("start", optimize)) || (m->inUsersGroups("end", optimize))) && !summarizedFasta) { //summarize based on summaryfile
713             if (namefile != "") { nameMap = m->readNames(namefile); }
714             else if (countfile != "") {
715                 CountTable ct;
716                 ct.readTable(countfile);
717                 nameMap = ct.getNameMap();
718             }
719             getSummaryReport();
720         }else if ((contigsreport != "") && ((m->inUsersGroups("minoverlap", optimize)) || (m->inUsersGroups("ostart", optimize)) || (m->inUsersGroups("oend", optimize)) || (m->inUsersGroups("mismatches", optimize)))) { //optimize settings based on contigs file
721             optimizeContigs();
722         }else if ((alignreport != "") && ((m->inUsersGroups("minsim", optimize)) || (m->inUsersGroups("minscore", optimize)) || (m->inUsersGroups("maxinsert", optimize)))) { //optimize settings based on contigs file
723             optimizeAlign();
724         }
725         
726         
727         //provided summary file, and did not set maxn so no need to summarize fasta
728         if (summaryfile != "")      {   numFastaSeqs = screenSummary(badSeqNames);  }
729         //add in any seqs that fail due to contigs report results
730         else if (contigsreport != "")    {   numFastaSeqs = screenContigs(badSeqNames);  }
731         //add in any seqs that fail due to align report
732         else if (alignreport != "")      {   numFastaSeqs = screenAlignReport(badSeqNames);  }
733         
734         return numFastaSeqs;
735     }
736         catch(exception& e) {
737                 m->errorOut(e, "ScreenSeqsCommand", "screenReports");
738                 exit(1);
739         }
740 }
741 //***************************************************************************************************************
742 int ScreenSeqsCommand::screenAlignReport(map<string, string>& badSeqNames){
743         try {
744         
745         map<string, string> variables; 
746         variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(alignreport));
747         string outSummary =  getOutputFileName("alignreport",variables);
748                 outputNames.push_back(outSummary); outputTypes["alignreport"].push_back(outSummary);
749         
750         string name, TemplateName, SearchMethod, AlignmentMethod;
751         //QueryName     QueryLength     TemplateName    TemplateLength  SearchMethod    SearchScore     AlignmentMethod QueryStart      QueryEnd        TemplateStart   TemplateEnd     PairwiseAlignmentLength GapsInQuery     GapsInTemplate  LongestInsert   SimBtwnQuery&Template
752         //checking for minScore, maxInsert, minSim
753         int length, TemplateLength,      QueryStart,    QueryEnd,       TemplateStart,  TemplateEnd,    PairwiseAlignmentLength,        GapsInQuery,    GapsInTemplate, LongestInsert;
754         float SearchScore, SimBtwnQueryTemplate;
755         
756         ofstream out;
757         m->openOutputFile(outSummary, out);
758         
759         //read summary file
760         ifstream in;
761         m->openInputFile(alignreport, in);
762         out << (m->getline(in)) << endl;   //skip headers
763         
764                 int count = 0;
765         
766                 while (!in.eof()) {
767             
768             if (m->control_pressed) { in.close(); out.close(); return 0; }
769             
770             //seqname   start   end     nbases  ambigs  polymer numSeqs
771             in >> name >> length >> TemplateName >> TemplateLength >> SearchMethod >> SearchScore >> AlignmentMethod >> QueryStart >> QueryEnd >> TemplateStart >> TemplateEnd >> PairwiseAlignmentLength >> GapsInQuery >> GapsInTemplate >> LongestInsert >> SimBtwnQueryTemplate; m->gobble(in);
772
773             bool goodSeq = 1;           //      innocent until proven guilty
774             string trashCode = "";
775             if(maxInsert != -1 && maxInsert < LongestInsert)    {       goodSeq = 0; trashCode += "insert|";    }
776             if(minScore != -1 && minScore > SearchScore)                {       goodSeq = 0; trashCode += "score|";     }
777             if(minSim != -1 && minSim > SimBtwnQueryTemplate)   {       goodSeq = 0; trashCode += "sim|";       }
778             
779             if(goodSeq == 1){
780                 out << name << '\t' << length << '\t' << TemplateName  << '\t' << TemplateLength  << '\t' << SearchMethod  << '\t' << SearchScore  << '\t' << AlignmentMethod  << '\t' << QueryStart  << '\t' << QueryEnd  << '\t' << TemplateStart  << '\t' << TemplateEnd  << '\t' << PairwiseAlignmentLength  << '\t' << GapsInQuery  << '\t' << GapsInTemplate  << '\t' << LongestInsert  << '\t' << SimBtwnQueryTemplate << endl;
781             }
782             else{ badSeqNames[name] = trashCode;  }
783             count++;
784         }
785         in.close();
786         out.close();
787         
788         int oldBadSeqsCount = badSeqNames.size();
789         
790         int numFastaSeqs = runFastaScreening(badSeqNames);
791         
792         if (oldBadSeqsCount != badSeqNames.size()) { //more seqs were removed by maxns
793             m->renameFile(outSummary, outSummary+".temp");
794             
795             ofstream out2;
796             m->openOutputFile(outSummary, out2);
797             
798             //read summary file
799             ifstream in2;
800             m->openInputFile(outSummary+".temp", in2);
801             out2 << (m->getline(in2)) << endl;   //skip headers
802             
803             while (!in2.eof()) {
804                 
805                 if (m->control_pressed) { in2.close(); out2.close(); return 0; }
806                 
807                 //seqname       start   end     nbases  ambigs  polymer numSeqs
808                 in2 >> name >> length >> TemplateName >> TemplateLength >> SearchMethod >> SearchScore >> AlignmentMethod >> QueryStart >> QueryEnd >> TemplateStart >> TemplateEnd >> PairwiseAlignmentLength >> GapsInQuery >> GapsInTemplate >> LongestInsert >> SimBtwnQueryTemplate; m->gobble(in2);
809                 
810                 if (badSeqNames.count(name) == 0) { //are you good?
811                     out2 << name << '\t' << length << '\t' << TemplateName  << '\t' << TemplateLength  << '\t' << SearchMethod  << '\t' << SearchScore  << '\t' << AlignmentMethod  << '\t' << QueryStart  << '\t' << QueryEnd  << '\t' << TemplateStart  << '\t' << TemplateEnd  << '\t' << PairwiseAlignmentLength  << '\t' << GapsInQuery  << '\t' << GapsInTemplate  << '\t' << LongestInsert  << '\t' << SimBtwnQueryTemplate << endl;             
812                 }
813             }
814             in2.close();
815             out2.close();
816             m->mothurRemove(outSummary+".temp");
817         }
818         
819         if (numFastaSeqs != count) {  m->mothurOut("[ERROR]: found " + toString(numFastaSeqs) + " sequences in your fasta file, and " + toString(count) + " sequences in your contigs report file, quitting.\n"); m->control_pressed = true; }
820         
821         
822         return count;
823         
824                 return 0;
825         
826         }
827         catch(exception& e) {
828                 m->errorOut(e, "ScreenSeqsCommand", "screenAlignReport");
829                 exit(1);
830         }
831         
832 }
833 //***************************************************************************************************************/
834 int ScreenSeqsCommand::screenContigs(map<string, string>& badSeqNames){
835         try{
836         map<string, string> variables; 
837         variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(contigsreport));
838         string outSummary =  getOutputFileName("contigsreport",variables);
839                 outputNames.push_back(outSummary); outputTypes["contigsreport"].push_back(outSummary);
840         
841         string name;
842         //Name  Length  Overlap_Length  Overlap_Start   Overlap_End     MisMatches      Num_Ns
843         int length, OLength, thisOStart, thisOEnd, numMisMatches, numNs;
844         
845         ofstream out;
846         m->openOutputFile(outSummary, out);
847         
848         //read summary file
849         ifstream in;
850         m->openInputFile(contigsreport, in);
851         out << (m->getline(in)) << endl;   //skip headers
852         
853                 int count = 0;
854         
855                 while (!in.eof()) {
856             
857             if (m->control_pressed) { in.close(); out.close(); return 0; }
858             
859             //seqname   start   end     nbases  ambigs  polymer numSeqs
860             in >> name >> length >> OLength >> thisOStart >> thisOEnd >> numMisMatches >> numNs; m->gobble(in);
861             
862             bool goodSeq = 1;           //      innocent until proven guilty
863             string trashCode = "";
864             if(oStart != -1 && oStart < thisOStart)             {       goodSeq = 0;    trashCode += "ostart|";     }
865             if(oEnd != -1 && oEnd > thisOEnd)                   {       goodSeq = 0;    trashCode += "oend|";       }
866             if(maxN != -1 && maxN <     numNs)                      {   goodSeq = 0;    trashCode += "n|";          }
867             if(minOverlap != -1 && minOverlap > OLength)                {       goodSeq = 0;    trashCode += "olength|";    }
868             if(mismatches != -1 && mismatches < numMisMatches)  {       goodSeq = 0;    trashCode += "mismatches|"; }
869             
870             if(goodSeq == 1){
871                 out << name << '\t' << length  << '\t' << OLength  << '\t' << thisOStart  << '\t' << thisOEnd  << '\t' << numMisMatches  << '\t' << numNs << endl;      
872             }
873             else{ badSeqNames[name] = trashCode; }
874             count++;
875         }
876         in.close();
877         out.close();
878         
879         int oldBadSeqsCount = badSeqNames.size();
880         
881         int numFastaSeqs = runFastaScreening(badSeqNames);
882         
883         if (oldBadSeqsCount != badSeqNames.size()) { //more seqs were removed by maxns
884             m->renameFile(outSummary, outSummary+".temp");
885             
886             ofstream out2;
887             m->openOutputFile(outSummary, out2);
888             
889             //read summary file
890             ifstream in2;
891             m->openInputFile(outSummary+".temp", in2);
892             out2 << (m->getline(in2)) << endl;   //skip headers
893             
894             while (!in2.eof()) {
895                 
896                 if (m->control_pressed) { in2.close(); out2.close(); return 0; }
897                 
898                 //seqname       start   end     nbases  ambigs  polymer numSeqs
899                 in2 >> name >> length >> OLength >> thisOStart >> thisOEnd >> numMisMatches >> numNs; m->gobble(in2);
900                 
901                 if (badSeqNames.count(name) == 0) { //are you good?
902                     out2 << name << '\t' << length  << '\t' << OLength  << '\t' << thisOStart  << '\t' << thisOEnd  << '\t' << numMisMatches  << '\t' << numNs << endl;         
903                 }
904             }
905             in2.close();
906             out2.close();
907             m->mothurRemove(outSummary+".temp");
908         }
909         
910         if (numFastaSeqs != count) {  m->mothurOut("[ERROR]: found " + toString(numFastaSeqs) + " sequences in your fasta file, and " + toString(count) + " sequences in your contigs report file, quitting.\n"); m->control_pressed = true; }
911         
912         
913         return count;
914         
915     }
916         catch(exception& e) {
917                 m->errorOut(e, "ScreenSeqsCommand", "screenContigs");
918                 exit(1);
919         }
920 }
921 //***************************************************************************************************************/
922 int ScreenSeqsCommand::screenSummary(map<string, string>& badSeqNames){
923         try{
924         map<string, string> variables; 
925         variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(summaryfile));
926         string outSummary =  getOutputFileName("summary",variables);
927                 outputNames.push_back(outSummary); outputTypes["summary"].push_back(outSummary);
928         
929         string name;
930         int start, end, length, ambigs, polymer, numReps;
931         
932         ofstream out;
933         m->openOutputFile(outSummary, out);
934                 
935         //read summary file
936         ifstream in;
937         m->openInputFile(summaryfile, in);
938         out << (m->getline(in)) << endl;   //skip headers
939          
940                 int count = 0;
941         
942                 while (!in.eof()) {
943             
944             if (m->control_pressed) { in.close(); out.close(); return 0; }
945             
946             //seqname   start   end     nbases  ambigs  polymer numSeqs
947             in >> name >> start >> end >> length >> ambigs >> polymer >> numReps; m->gobble(in);
948             
949             bool goodSeq = 1;           //      innocent until proven guilty
950             string trashCode = "";
951             if(startPos != -1 && startPos < start)                      {       goodSeq = 0;    trashCode += "start|"; }
952             if(endPos != -1 && endPos > end)                            {       goodSeq = 0;    trashCode += "end|"; }
953             if(maxAmbig != -1 && maxAmbig <     ambigs)         {       goodSeq = 0;    trashCode += "ambig|"; }
954             if(maxHomoP != -1 && maxHomoP < polymer)        {   goodSeq = 0;    trashCode += "homop|"; }
955             if(minLength != -1 && minLength > length)           {       goodSeq = 0;    trashCode += "<length|"; }
956             if(maxLength != -1 && maxLength < length)           {       goodSeq = 0;    trashCode += ">length|"; }
957             
958             if(goodSeq == 1){
959                 out << name << '\t' << start  << '\t' << end  << '\t' << length  << '\t' << ambigs  << '\t' << polymer  << '\t' << numReps << endl;     
960             }
961             else{ badSeqNames[name] = trashCode; }
962             count++;
963         }
964         in.close();
965         out.close();
966         
967         int oldBadSeqsCount = badSeqNames.size();
968         
969         int numFastaSeqs = runFastaScreening(badSeqNames);
970         
971         if (oldBadSeqsCount != badSeqNames.size()) { //more seqs were removed by maxns
972             m->renameFile(outSummary, outSummary+".temp");
973             
974             ofstream out2;
975             m->openOutputFile(outSummary, out2);
976             
977             //read summary file
978             ifstream in2;
979             m->openInputFile(outSummary+".temp", in2);
980             out2 << (m->getline(in2)) << endl;   //skip headers
981             
982             while (!in2.eof()) {
983                 
984                 if (m->control_pressed) { in2.close(); out2.close(); return 0; }
985                 
986                 //seqname       start   end     nbases  ambigs  polymer numSeqs
987                 in2 >> name >> start >> end >> length >> ambigs >> polymer >> numReps; m->gobble(in2);
988                 
989                 if (badSeqNames.count(name) == 0) { //are you good?
990                     out2 << name << '\t' << start  << '\t' << end  << '\t' << length  << '\t' << ambigs  << '\t' << polymer  << '\t' << numReps << endl;        
991                 }
992             }
993             in2.close();
994             out2.close();
995             m->mothurRemove(outSummary+".temp");
996         }
997         
998         if (numFastaSeqs != count) {  m->mothurOut("[ERROR]: found " + toString(numFastaSeqs) + " sequences in your fasta file, and " + toString(count) + " sequences in your summary file, quitting.\n"); m->control_pressed = true; }
999         
1000         
1001         
1002         return count;
1003     }
1004         catch(exception& e) {
1005                 m->errorOut(e, "ScreenSeqsCommand", "screenSummary");
1006                 exit(1);
1007         }
1008 }
1009 //***************************************************************************************************************/
1010 int ScreenSeqsCommand::screenFasta(map<string, string>& badSeqNames){
1011         try{
1012         
1013         
1014         //if the user want to optimize we need to know the 90% mark
1015                 vector<unsigned long long> positions;
1016                 if (optimize.size() != 0) {  //get summary is paralellized so we need to divideFile, no need to do this step twice so I moved it here
1017                         //use the namefile to optimize correctly
1018                         if (namefile != "") { nameMap = m->readNames(namefile); }
1019             else if (countfile != "") {
1020                 CountTable ct;
1021                 ct.readTable(countfile);
1022                 nameMap = ct.getNameMap();
1023             }
1024                         getSummary(positions); 
1025                 }else { 
1026 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
1027             positions = m->divideFile(fastafile, processors);
1028             for (int i = 0; i < (positions.size()-1); i++) { lines.push_back(linePair(positions[i], positions[(i+1)])); }
1029 #else 
1030             if(processors == 1){ lines.push_back(linePair(0, 1000));  }
1031             else {
1032                 int numFastaSeqs = 0;
1033                 positions = m->setFilePosFasta(fastafile, numFastaSeqs); 
1034                 if (positions.size() < processors) { processors = positions.size(); }
1035                 
1036                 //figure out how many sequences you have to process
1037                 int numSeqsPerProcessor = numFastaSeqs / processors;
1038                 for (int i = 0; i < processors; i++) {
1039                     int startIndex =  i * numSeqsPerProcessor;
1040                     if(i == (processors - 1)){  numSeqsPerProcessor = numFastaSeqs - i * numSeqsPerProcessor;   }
1041                     lines.push_back(linePair(positions[startIndex], numSeqsPerProcessor));
1042                 }
1043             }
1044 #endif
1045                 }
1046         
1047         if (m->control_pressed) { return 0; }
1048         
1049         int numFastaSeqs = runFastaScreening(badSeqNames);
1050         
1051         return numFastaSeqs;
1052         
1053     }
1054         catch(exception& e) {
1055                 m->errorOut(e, "ScreenSeqsCommand", "screenFasta");
1056                 exit(1);
1057         }
1058 }       
1059 //***************************************************************************************************************
1060
1061 int ScreenSeqsCommand::screenNameGroupFile(map<string, string> badSeqNames){
1062         try {
1063                 ifstream inputNames;
1064                 m->openInputFile(namefile, inputNames);
1065                 map<string, string> badSeqGroups;
1066                 string seqName, seqList, group;
1067                 map<string, string>::iterator it;
1068         map<string, string> variables; 
1069                 variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(namefile));
1070         variables["[extension]"] = m->getExtension(namefile);
1071                 string goodNameFile = getOutputFileName("name", variables);
1072                 outputNames.push_back(goodNameFile);  outputTypes["name"].push_back(goodNameFile);
1073                 
1074                 ofstream goodNameOut;   m->openOutputFile(goodNameFile, goodNameOut);
1075                 
1076                 while(!inputNames.eof()){
1077                         if (m->control_pressed) { goodNameOut.close();  inputNames.close(); m->mothurRemove(goodNameFile);  return 0; }
1078
1079                         inputNames >> seqName; m->gobble(inputNames); inputNames >> seqList;
1080                         it = badSeqNames.find(seqName);
1081                                 
1082                         if(it != badSeqNames.end()){
1083                                 if(namefile != ""){
1084                                         int start = 0;
1085                                         for(int i=0;i<seqList.length();i++){
1086                                                 if(seqList[i] == ','){
1087                                                         badSeqGroups[seqList.substr(start,i-start)] = it->second;
1088                                                         start = i+1;
1089                                                 }                                       
1090                                         }
1091                                         badSeqGroups[seqList.substr(start,seqList.length()-start)] = it->second;
1092                                 }
1093                 badSeqNames.erase(it);
1094                         }
1095                         else{
1096                                 goodNameOut << seqName << '\t' << seqList << endl;
1097                         }
1098                         m->gobble(inputNames);
1099                 }
1100                 inputNames.close();
1101                 goodNameOut.close();
1102         
1103                 //we were unable to remove some of the bad sequences
1104                 if (badSeqNames.size() != 0) {
1105                         for (it = badSeqNames.begin(); it != badSeqNames.end(); it++) {  
1106                                 m->mothurOut("Your namefile does not include the sequence " + it->first + " please correct."); 
1107                                 m->mothurOutEndLine();
1108                         }
1109                 }
1110
1111                 if(groupfile != ""){
1112                         
1113                         ifstream inputGroups;
1114                         m->openInputFile(groupfile, inputGroups);
1115             variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(groupfile));
1116             variables["[extension]"] = m->getExtension(groupfile);
1117             string goodGroupFile = getOutputFileName("group", variables);
1118                         
1119                         outputNames.push_back(goodGroupFile);   outputTypes["group"].push_back(goodGroupFile);
1120                         
1121                         ofstream goodGroupOut;  m->openOutputFile(goodGroupFile, goodGroupOut);
1122                         
1123                         while(!inputGroups.eof()){
1124                                 if (m->control_pressed) { goodGroupOut.close(); inputGroups.close(); m->mothurRemove(goodNameFile);  m->mothurRemove(goodGroupFile); return 0; }
1125
1126                                 inputGroups >> seqName; m->gobble(inputGroups); inputGroups >> group;
1127                                 
1128                                 it = badSeqGroups.find(seqName);
1129                                 
1130                                 if(it != badSeqGroups.end()){
1131                                         badSeqGroups.erase(it);
1132                                 }
1133                                 else{
1134                                         goodGroupOut << seqName << '\t' << group << endl;
1135                                 }
1136                                 m->gobble(inputGroups);
1137                         }
1138                         inputGroups.close();
1139                         goodGroupOut.close();
1140                         
1141                         //we were unable to remove some of the bad sequences
1142                         if (badSeqGroups.size() != 0) {
1143                                 for (it = badSeqGroups.begin(); it != badSeqGroups.end(); it++) {  
1144                                         m->mothurOut("Your groupfile does not include the sequence " + it->first + " please correct."); 
1145                                         m->mothurOutEndLine();
1146                                 }
1147                         }
1148                 }
1149                 
1150                 
1151                 return 0;
1152         
1153         }
1154         catch(exception& e) {
1155                 m->errorOut(e, "ScreenSeqsCommand", "screenNameGroupFile");
1156                 exit(1);
1157         }
1158 }
1159 //***************************************************************************************************************
1160 int ScreenSeqsCommand::getSummaryReport(){
1161         try {
1162                 
1163                 vector<int> startPosition;
1164                 vector<int> endPosition;
1165                 vector<int> seqLength;
1166                 vector<int> ambigBases;
1167                 vector<int> longHomoPolymer;
1168         
1169 #ifdef USE_MPI
1170                 int pid;
1171                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); 
1172                 
1173                 if (pid == 0) { 
1174 #endif
1175             
1176             
1177             //read summary file
1178             ifstream in;
1179             m->openInputFile(summaryfile, in);
1180             m->getline(in);
1181             
1182             string name;
1183             int start, end, length, ambigs, polymer, numReps;
1184             
1185             while (!in.eof()) {
1186                 
1187                 if (m->control_pressed) { in.close(); return 0; }
1188                 
1189                 //seqname       start   end     nbases  ambigs  polymer numSeqs
1190                 in >> name >> start >> end >> length >> ambigs >> polymer >> numReps; m->gobble(in);
1191                 
1192                 int num = 1;
1193                                 if ((namefile != "") || (countfile !="")) {
1194                                         //make sure this sequence is in the namefile, else error 
1195                                         map<string, int>::iterator it = nameMap.find(name);
1196                                         
1197                                         if (it == nameMap.end()) { m->mothurOut("[ERROR]: " + name + " is not in your namefile, please correct."); m->mothurOutEndLine(); m->control_pressed = true; }
1198                                         else { num = it->second; }
1199                                 }
1200                                 
1201                                 //for each sequence this sequence represents
1202                                 for (int i = 0; i < num; i++) {
1203                                         startPosition.push_back(start);
1204                                         endPosition.push_back(end);
1205                                         seqLength.push_back(length);
1206                                         ambigBases.push_back(ambigs);
1207                                         longHomoPolymer.push_back(polymer);
1208                                 }
1209                
1210             }
1211             in.close();
1212
1213         sort(startPosition.begin(), startPosition.end());
1214                 sort(endPosition.begin(), endPosition.end());
1215                 sort(seqLength.begin(), seqLength.end());
1216                 sort(ambigBases.begin(), ambigBases.end());
1217                 sort(longHomoPolymer.begin(), longHomoPolymer.end());
1218                 
1219                 //numSeqs is the number of unique seqs, startPosition.size() is the total number of seqs, we want to optimize using all seqs
1220                 int criteriaPercentile  = int(startPosition.size() * (criteria / (float) 100));
1221                 
1222                 for (int i = 0; i < optimize.size(); i++) {
1223                         if (optimize[i] == "start") { startPos = startPosition[criteriaPercentile]; m->mothurOut("Optimizing start to " + toString(startPos) + "."); m->mothurOutEndLine(); }
1224                         else if (optimize[i] == "end") { int endcriteriaPercentile = int(endPosition.size() * ((100 - criteria) / (float) 100));  endPos = endPosition[endcriteriaPercentile]; m->mothurOut("Optimizing end to " + toString(endPos) + "."); m->mothurOutEndLine();}
1225                         else if (optimize[i] == "maxambig") { maxAmbig = ambigBases[criteriaPercentile]; m->mothurOut("Optimizing maxambig to " + toString(maxAmbig) + "."); m->mothurOutEndLine(); }
1226                         else if (optimize[i] == "maxhomop") { maxHomoP = longHomoPolymer[criteriaPercentile]; m->mothurOut("Optimizing maxhomop to " + toString(maxHomoP) + "."); m->mothurOutEndLine(); }
1227                         else if (optimize[i] == "minlength") { int mincriteriaPercentile = int(seqLength.size() * ((100 - criteria) / (float) 100)); minLength = seqLength[mincriteriaPercentile]; m->mothurOut("Optimizing minlength to " + toString(minLength) + "."); m->mothurOutEndLine(); }
1228                         else if (optimize[i] == "maxlength") { maxLength = seqLength[criteriaPercentile]; m->mothurOut("Optimizing maxlength to " + toString(maxLength) + "."); m->mothurOutEndLine(); }
1229                 }
1230         
1231 #ifdef USE_MPI
1232     }
1233     
1234     MPI_Status status; 
1235     MPI_Comm_rank(MPI_COMM_WORLD, &pid); 
1236     MPI_Comm_size(MPI_COMM_WORLD, &processors); 
1237     
1238     if (pid == 0) { 
1239         //send file positions to all processes
1240         for(int i = 1; i < processors; i++) { 
1241             MPI_Send(&startPos, 1, MPI_INT, i, 2001, MPI_COMM_WORLD);
1242             MPI_Send(&endPos, 1, MPI_INT, i, 2001, MPI_COMM_WORLD);
1243             MPI_Send(&maxAmbig, 1, MPI_INT, i, 2001, MPI_COMM_WORLD);
1244             MPI_Send(&maxHomoP, 1, MPI_INT, i, 2001, MPI_COMM_WORLD);
1245             MPI_Send(&minLength, 1, MPI_INT, i, 2001, MPI_COMM_WORLD);
1246             MPI_Send(&maxLength, 1, MPI_INT, i, 2001, MPI_COMM_WORLD);
1247         }
1248     }else {
1249         MPI_Recv(&startPos, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status);
1250         MPI_Recv(&endPos, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status);
1251         MPI_Recv(&maxAmbig, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status);
1252         MPI_Recv(&maxHomoP, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status);
1253         MPI_Recv(&minLength, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status);
1254         MPI_Recv(&maxLength, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status);
1255     }
1256     MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case
1257 #endif
1258         return 0;
1259         
1260     }
1261         catch(exception& e) {
1262                 m->errorOut(e, "ScreenSeqsCommand", "getSummaryReport");
1263                 exit(1);
1264         }
1265 }
1266 //***************************************************************************************************************
1267 int ScreenSeqsCommand::optimizeContigs(){
1268         try {
1269                 vector<int> olengths;
1270                 vector<int> oStarts;
1271                 vector<int> oEnds;
1272                 vector<int> numMismatches;
1273         vector<int> numNs;
1274                 
1275         vector<unsigned long long> positions;
1276         vector<linePair> contigsLines;
1277 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
1278                 positions = m->divideFilePerLine(contigsreport, processors);
1279                 for (int i = 0; i < (positions.size()-1); i++) { contigsLines.push_back(linePair(positions[i], positions[(i+1)])); }    
1280 #else
1281                 if(processors == 1){ contigsLines.push_back(linePair(0, 1000));  }
1282         else {
1283             int numContigsSeqs = 0;
1284             positions = m->setFilePosEachLine(contigsreport, numContigsSeqs); 
1285             if (positions.size() < processors) { processors = positions.size(); }
1286             
1287             //figure out how many sequences you have to process
1288             int numSeqsPerProcessor = numContigsSeqs / processors;
1289             for (int i = 0; i < processors; i++) {
1290                 int startIndex =  i * numSeqsPerProcessor;
1291                 if(i == (processors - 1)){      numSeqsPerProcessor = numContigsSeqs - i * numSeqsPerProcessor;         }
1292                 contigsLines.push_back(linePair(positions[startIndex], numSeqsPerProcessor));
1293             }
1294         }
1295 #endif
1296                 
1297 #ifdef USE_MPI
1298                 int pid;
1299                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); 
1300                 
1301                 if (pid == 0) { 
1302                         driverContigsSummary(olengths, oStarts, oEnds, numMismatches, numNs, contigsLines[0]);
1303 #else
1304             createProcessesContigsSummary(olengths, oStarts, oEnds, numMismatches, numNs, contigsLines); 
1305             
1306                         if (m->control_pressed) {  return 0; }
1307 #endif
1308             sort(olengths.begin(), olengths.end());
1309             sort(oStarts.begin(), oStarts.end());
1310             sort(oEnds.begin(), oEnds.end());
1311             sort(numMismatches.begin(), numMismatches.end());
1312             sort(numNs.begin(), numNs.end());
1313             
1314             //numSeqs is the number of unique seqs, startPosition.size() is the total number of seqs, we want to optimize using all seqs
1315             int criteriaPercentile      = int(oStarts.size() * (criteria / (float) 100));
1316             
1317             for (int i = 0; i < optimize.size(); i++) {
1318                 if (optimize[i] == "ostart") { oStart = oStarts[criteriaPercentile]; m->mothurOut("Optimizing ostart to " + toString(oStart) + "."); m->mothurOutEndLine(); }
1319                 else if (optimize[i] == "oend") { int endcriteriaPercentile = int(oEnds.size() * ((100 - criteria) / (float) 100));  oEnd = oEnds[endcriteriaPercentile]; m->mothurOut("Optimizing oend to " + toString(oEnd) + "."); m->mothurOutEndLine();}
1320                 else if (optimize[i] == "mismatches") { mismatches = numMismatches[criteriaPercentile]; m->mothurOut("Optimizing mismatches to " + toString(mismatches) + "."); m->mothurOutEndLine(); }
1321                 else if (optimize[i] == "maxn") { maxN = numNs[criteriaPercentile]; m->mothurOut("Optimizing maxn to " + toString(maxN) + "."); m->mothurOutEndLine(); }
1322                 else if (optimize[i] == "minoverlap") { int mincriteriaPercentile = int(olengths.size() * ((100 - criteria) / (float) 100)); minOverlap = olengths[mincriteriaPercentile]; m->mothurOut("Optimizing minoverlap to " + toString(minOverlap) + "."); m->mothurOutEndLine(); }
1323
1324             }
1325             
1326 #ifdef USE_MPI
1327                 }
1328                 
1329                 MPI_Status status; 
1330                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); 
1331                 MPI_Comm_size(MPI_COMM_WORLD, &processors); 
1332         
1333                 if (pid == 0) { 
1334                         //send file positions to all processes
1335                         for(int i = 1; i < processors; i++) { 
1336                 MPI_Send(&minOverlap, 1, MPI_INT, i, 2001, MPI_COMM_WORLD);
1337                                 MPI_Send(&oStart, 1, MPI_INT, i, 2001, MPI_COMM_WORLD);
1338                                 MPI_Send(&oEnd, 1, MPI_INT, i, 2001, MPI_COMM_WORLD);
1339                                 MPI_Send(&mismatches, 1, MPI_INT, i, 2001, MPI_COMM_WORLD);
1340                 MPI_Send(&maxN, 1, MPI_INT, i, 2001, MPI_COMM_WORLD);
1341                         }
1342                 }else {
1343             MPI_Recv(&minOverlap, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status);
1344                         MPI_Recv(&oStart, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status);
1345                         MPI_Recv(&oEnd, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status);
1346                         MPI_Recv(&mismatches, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status);
1347             MPI_Recv(&maxN, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status);
1348                 }
1349                 MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case
1350 #endif
1351                 return 0;
1352         }
1353         catch(exception& e) {
1354                 m->errorOut(e, "ScreenSeqsCommand", "optimizeContigs");
1355                 exit(1);
1356         }
1357 }
1358 /**************************************************************************************/
1359 int ScreenSeqsCommand::driverContigsSummary(vector<int>& oLength, vector<int>& ostartPosition, vector<int>& oendPosition, vector<int>& omismatches, vector<int>& numNs, linePair filePos) {     
1360         try {
1361                 
1362         string name;
1363         //Name  Length  Overlap_Length  Overlap_Start   Overlap_End     MisMatches      Num_Ns
1364         int length, OLength, thisOStart, thisOEnd, numMisMatches, numns;
1365         
1366                 ifstream in;
1367                 m->openInputFile(contigsreport, in);
1368         
1369                 in.seekg(filePos.start);
1370         if (filePos.start == 0) { //read headers
1371             m->getline(in); m->gobble(in);
1372         }
1373         
1374                 bool done = false;
1375                 int count = 0;
1376         
1377                 while (!done) {
1378             
1379                         if (m->control_pressed) { in.close(); return 1; }
1380             
1381             //seqname   start   end     nbases  ambigs  polymer numSeqs
1382             in >> name >> length >> OLength >> thisOStart >> thisOEnd >> numMisMatches >> numns; m->gobble(in);
1383             
1384             int num = 1;
1385             if ((namefile != "") || (countfile !="")){
1386                 //make sure this sequence is in the namefile, else error 
1387                 map<string, int>::iterator it = nameMap.find(name);
1388                 
1389                 if (it == nameMap.end()) { m->mothurOut("[ERROR]: " + name + " is not in your namefile, please correct."); m->mothurOutEndLine(); m->control_pressed = true; }
1390                 else { num = it->second; }
1391             }
1392             
1393             //for each sequence this sequence represents
1394             for (int i = 0; i < num; i++) {
1395                 ostartPosition.push_back(thisOStart);
1396                 oendPosition.push_back(thisOEnd);
1397                 oLength.push_back(OLength);
1398                 omismatches.push_back(numMisMatches);
1399                 numNs.push_back(numns);
1400             }
1401             
1402             count++;
1403                         
1404                         //if((count) % 100 == 0){       m->mothurOut("Optimizing sequence: " + toString(count)); m->mothurOutEndLine();         }
1405 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
1406             unsigned long long pos = in.tellg();
1407             if ((pos == -1) || (pos >= filePos.end)) { break; }
1408 #else
1409             if (in.eof()) { break; }
1410 #endif
1411                 }
1412                 
1413                 in.close();
1414                 
1415                 return count;
1416         }
1417         catch(exception& e) {
1418                 m->errorOut(e, "ScreenSeqsCommand", "driverContigsSummary");
1419                 exit(1);
1420         }
1421 }
1422
1423 /**************************************************************************************************/
1424 int ScreenSeqsCommand::createProcessesContigsSummary(vector<int>& oLength, vector<int>& ostartPosition, vector<int>& oendPosition, vector<int>& omismatches, vector<int>& numNs, vector<linePair> contigsLines) {
1425         try {
1426         
1427         int process = 1;
1428                 int num = 0;
1429                 vector<int> processIDS;
1430         
1431 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
1432         
1433                 //loop through and create all the processes you want
1434                 while (process != processors) {
1435                         int pid = fork();
1436                         
1437                         if (pid > 0) {
1438                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
1439                                 process++;
1440                         }else if (pid == 0){
1441                                 num = driverContigsSummary(oLength, ostartPosition, oendPosition, omismatches, numNs, contigsLines[process]);
1442                                 
1443                                 //pass numSeqs to parent
1444                                 ofstream out;
1445                                 string tempFile = contigsreport + toString(getpid()) + ".num.temp";
1446                                 m->openOutputFile(tempFile, out);
1447                                 
1448                                 out << num << endl;
1449                                 out << ostartPosition.size() << endl;
1450                                 for (int k = 0; k < ostartPosition.size(); k++)         {               out << ostartPosition[k] << '\t';   }  out << endl;
1451                                 for (int k = 0; k < oendPosition.size(); k++)           {               out << oendPosition[k] << '\t';     }  out << endl;
1452                                 for (int k = 0; k < oLength.size(); k++)                        {               out << oLength[k] << '\t';          }  out << endl;
1453                                 for (int k = 0; k < omismatches.size(); k++)        {           out << omismatches[k] << '\t';      }  out << endl;
1454                 for (int k = 0; k < numNs.size(); k++)              {           out << numNs[k] << '\t';            }  out << endl;
1455                                 
1456                                 out.close();
1457                                 
1458                                 exit(0);
1459                         }else { 
1460                                 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine(); 
1461                                 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
1462                                 exit(0);
1463                         }
1464                 }
1465                 
1466                 num = driverContigsSummary(oLength, ostartPosition, oendPosition, omismatches, numNs, contigsLines[0]);
1467                 
1468                 //force parent to wait until all the processes are done
1469                 for (int i=0;i<processIDS.size();i++) { 
1470                         int temp = processIDS[i];
1471                         wait(&temp);
1472                 }
1473                 
1474                 //parent reads in and combine Filter info
1475                 for (int i = 0; i < processIDS.size(); i++) {
1476                         string tempFilename = contigsreport + toString(processIDS[i]) + ".num.temp";
1477                         ifstream in;
1478                         m->openInputFile(tempFilename, in);
1479                         
1480                         int temp, tempNum;
1481                         in >> tempNum; m->gobble(in); num += tempNum;
1482                         in >> tempNum; m->gobble(in);
1483                         for (int k = 0; k < tempNum; k++)                       {               in >> temp; ostartPosition.push_back(temp);             }               m->gobble(in);
1484                         for (int k = 0; k < tempNum; k++)                       {               in >> temp; oendPosition.push_back(temp);               }               m->gobble(in);
1485                         for (int k = 0; k < tempNum; k++)                       {               in >> temp; oLength.push_back(temp);                    }               m->gobble(in);
1486                         for (int k = 0; k < tempNum; k++)                       {               in >> temp; omismatches.push_back(temp);        }               m->gobble(in);
1487             for (int k = 0; k < tempNum; k++)                   {               in >> temp; numNs.push_back(temp);              }               m->gobble(in);
1488             
1489                         in.close();
1490                         m->mothurRemove(tempFilename);
1491                 }
1492                 
1493                 
1494 #else 
1495         //////////////////////////////////////////////////////////////////////////////////////////////////////
1496                 //Windows version shared memory, so be careful when passing variables through the seqSumData struct. 
1497                 //Above fork() will clone, so memory is separate, but that's not the case with windows, 
1498                 //Taking advantage of shared memory to allow both threads to add info to vectors.
1499                 //////////////////////////////////////////////////////////////////////////////////////////////////////
1500                 /*
1501                 vector<contigsSumData*> pDataArray; 
1502                 DWORD   dwThreadIdArray[processors-1];
1503                 HANDLE  hThreadArray[processors-1]; 
1504                 
1505                 //Create processor worker threads.
1506                 for( int i=0; i<processors-1; i++ ){
1507             
1508                         // Allocate memory for thread data.
1509                         contigsSumData* tempSum = new contigsSumData(contigsreport, m, contigsLines[i].start, contigsLines[i].end, namefile, countfile, nameMap);
1510                         pDataArray.push_back(tempSum);
1511                         
1512                         //MySeqSumThreadFunction is in header. It must be global or static to work with the threads.
1513                         //default security attributes, thread function name, argument to thread function, use default creation flags, returns the thread identifier
1514                         hThreadArray[i] = CreateThread(NULL, 0, MyContigsSumThreadFunction, pDataArray[i], 0, &dwThreadIdArray[i]);   
1515                 }
1516                 */
1517         contigsLines[processors-1].start = 0;
1518         //do your part
1519                 num = driverContigsSummary(oLength, ostartPosition, oendPosition, omismatches, numNs, contigsLines[processors-1]);
1520         /*
1521                 //Wait until all threads have terminated.
1522                 WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
1523                 
1524                 //Close all thread handles and free memory allocations.
1525                 for(int i=0; i < pDataArray.size(); i++){
1526                         num += pDataArray[i]->count;
1527             if (pDataArray[i]->count != pDataArray[i]->end) {
1528                 m->mothurOut("[ERROR]: process " + toString(i) + " only processed " + toString(pDataArray[i]->count) + " of " + toString(pDataArray[i]->end) + " sequences assigned to it, quitting. \n"); m->control_pressed = true; 
1529             }
1530             for (int k = 0; k < pDataArray[i]->ostartPosition.size(); k++)  {   ostartPosition.push_back(pDataArray[i]->ostartPosition[k]);     }
1531                         for (int k = 0; k < pDataArray[i]->oendPosition.size(); k++)    {       oendPosition.push_back(pDataArray[i]->oendPosition[k]);         }
1532             for (int k = 0; k < pDataArray[i]->oLength.size(); k++)         {   oLength.push_back(pDataArray[i]->oLength[k]);                   }
1533             for (int k = 0; k < pDataArray[i]->omismatches.size(); k++)     {   omismatches.push_back(pDataArray[i]->omismatches[k]);           }
1534             for (int k = 0; k < pDataArray[i]->numNs.size(); k++)           {   numNs.push_back(pDataArray[i]->numNs[k]);                       }
1535                         CloseHandle(hThreadArray[i]);
1536                         delete pDataArray[i];
1537                 }
1538         */
1539 #endif          
1540         return num;
1541         }
1542         catch(exception& e) {
1543                 m->errorOut(e, "ScreenSeqsCommand", "createProcessesContigsSummary");
1544                 exit(1);
1545         }
1546 }
1547 //***************************************************************************************************************
1548 int ScreenSeqsCommand::optimizeAlign(){
1549         try {
1550         
1551                 vector<float> sims;
1552                 vector<float> scores;
1553                 vector<int> inserts;
1554                 
1555         vector<unsigned long long> positions;
1556         vector<linePair> alignLines;
1557 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
1558                 positions = m->divideFilePerLine(alignreport, processors);
1559                 for (int i = 0; i < (positions.size()-1); i++) { alignLines.push_back(linePair(positions[i], positions[(i+1)])); }      
1560 #else
1561                 if(processors == 1){ alignLines.push_back(linePair(0, 1000));  }
1562         else {
1563             int numAlignSeqs = 0;
1564             positions = m->setFilePosEachLine(alignreport, numAlignSeqs); 
1565             if (positions.size() < processors) { processors = positions.size(); }
1566             
1567             //figure out how many sequences you have to process
1568             int numSeqsPerProcessor = numAlignSeqs / processors;
1569             for (int i = 0; i < processors; i++) {
1570                 int startIndex =  i * numSeqsPerProcessor;
1571                 if(i == (processors - 1)){      numSeqsPerProcessor = numAlignSeqs - i * numSeqsPerProcessor;   }
1572                 alignLines.push_back(linePair(positions[startIndex], numSeqsPerProcessor));
1573             }
1574         }
1575 #endif
1576                 
1577 #ifdef USE_MPI
1578                 int pid;
1579                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); 
1580                 
1581                 if (pid == 0) { 
1582                         driverAlignSummary(sims, scores, inserts, alignLines[0]);
1583 #else
1584             createProcessesAlignSummary(sims, scores, inserts, alignLines); 
1585             
1586                         if (m->control_pressed) {  return 0; }
1587 #endif
1588             sort(sims.begin(), sims.end());
1589             sort(scores.begin(), scores.end());
1590             sort(inserts.begin(), inserts.end());
1591             
1592             //numSeqs is the number of unique seqs, startPosition.size() is the total number of seqs, we want to optimize using all seqs
1593             int criteriaPercentile      = int(sims.size() * (criteria / (float) 100));
1594             
1595             for (int i = 0; i < optimize.size(); i++) {
1596                 if (optimize[i] == "minsim") { int mincriteriaPercentile = int(sims.size() * ((100 - criteria) / (float) 100)); minSim = sims[mincriteriaPercentile];  m->mothurOut("Optimizing minsim to " + toString(minSim) + "."); m->mothurOutEndLine();}
1597                 else if (optimize[i] == "minscore") { int mincriteriaPercentile = int(scores.size() * ((100 - criteria) / (float) 100)); minScore = scores[mincriteriaPercentile];  m->mothurOut("Optimizing minscore to " + toString(minScore) + "."); m->mothurOutEndLine(); }
1598                 else if (optimize[i] == "maxinsert") { maxInsert = inserts[criteriaPercentile]; m->mothurOut("Optimizing maxinsert to " + toString(maxInsert) + "."); m->mothurOutEndLine(); }
1599             }
1600             
1601 #ifdef USE_MPI
1602                 }
1603                 
1604                 MPI_Status status; 
1605                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); 
1606                 MPI_Comm_size(MPI_COMM_WORLD, &processors); 
1607         
1608                 if (pid == 0) { 
1609                         //send file positions to all processes
1610                         for(int i = 1; i < processors; i++) { 
1611                 MPI_Send(&minSim, 1, MPI_INT, i, 2001, MPI_COMM_WORLD);
1612                                 MPI_Send(&minScore, 1, MPI_INT, i, 2001, MPI_COMM_WORLD);
1613                                 MPI_Send(&maxInsert, 1, MPI_INT, i, 2001, MPI_COMM_WORLD);
1614                         }
1615                 }else {
1616             MPI_Recv(&minSim, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status);
1617                         MPI_Recv(&minScore, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status);
1618                         MPI_Recv(&maxInsert, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status);
1619                 }
1620                 MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case
1621 #endif
1622                 return 0;
1623         }
1624         catch(exception& e) {
1625                 m->errorOut(e, "ScreenSeqsCommand", "optimizeContigs");
1626                 exit(1);
1627         }
1628 }
1629 /**************************************************************************************/
1630 int ScreenSeqsCommand::driverAlignSummary(vector<float>& sims, vector<float>& scores, vector<int>& inserts, linePair filePos) { 
1631         try {
1632                 
1633         string name, TemplateName, SearchMethod, AlignmentMethod;
1634         //QueryName     QueryLength     TemplateName    TemplateLength  SearchMethod    SearchScore     AlignmentMethod QueryStart      QueryEnd        TemplateStart   TemplateEnd     PairwiseAlignmentLength GapsInQuery     GapsInTemplate  LongestInsert   SimBtwnQuery&Template
1635         //checking for minScore, maxInsert, minSim
1636         int length, TemplateLength,      QueryStart,    QueryEnd,       TemplateStart,  TemplateEnd,    PairwiseAlignmentLength,        GapsInQuery,    GapsInTemplate, LongestInsert;
1637         float SearchScore, SimBtwnQueryTemplate;
1638          
1639                 ifstream in;
1640                 m->openInputFile(alignreport, in);
1641         
1642                 in.seekg(filePos.start);
1643         if (filePos.start == 0) { //read headers
1644             m->getline(in); m->gobble(in);
1645         }
1646         
1647                 bool done = false;
1648                 int count = 0;
1649         
1650                 while (!done) {
1651             
1652                         if (m->control_pressed) { in.close(); return 1; }
1653             
1654             in >> name >> length >> TemplateName >> TemplateLength >> SearchMethod >> SearchScore >> AlignmentMethod >> QueryStart >> QueryEnd >> TemplateStart >> TemplateEnd >> PairwiseAlignmentLength >> GapsInQuery >> GapsInTemplate >> LongestInsert >> SimBtwnQueryTemplate; m->gobble(in);
1655             
1656             int num = 1;
1657             if ((namefile != "") || (countfile !="")){
1658                 //make sure this sequence is in the namefile, else error 
1659                 map<string, int>::iterator it = nameMap.find(name);
1660                 
1661                 if (it == nameMap.end()) { m->mothurOut("[ERROR]: " + name + " is not in your namefile, please correct."); m->mothurOutEndLine(); m->control_pressed = true; }
1662                 else { num = it->second; }
1663             }
1664             
1665             //for each sequence this sequence represents
1666             for (int i = 0; i < num; i++) {
1667                 sims.push_back(SimBtwnQueryTemplate);
1668                 scores.push_back(SearchScore);
1669                 inserts.push_back(LongestInsert);
1670             }
1671             
1672             count++;
1673                         
1674                         //if((count) % 100 == 0){       m->mothurOut("Optimizing sequence: " + toString(count)); m->mothurOutEndLine();         }
1675 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
1676             unsigned long long pos = in.tellg();
1677             if ((pos == -1) || (pos >= filePos.end)) { break; }
1678 #else
1679             if (in.eof()) { break; }
1680 #endif
1681                 }
1682                 
1683                 in.close();
1684                 
1685                 return count;
1686         }
1687         catch(exception& e) {
1688                 m->errorOut(e, "ScreenSeqsCommand", "driverAlignSummary");
1689                 exit(1);
1690         }
1691 }
1692
1693 /**************************************************************************************************/
1694 int ScreenSeqsCommand::createProcessesAlignSummary(vector<float>& sims, vector<float>& scores, vector<int>& inserts, vector<linePair> alignLines) {
1695         try {
1696         
1697         int process = 1;
1698                 int num = 0;
1699                 vector<int> processIDS;
1700         
1701 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
1702         
1703                 //loop through and create all the processes you want
1704                 while (process != processors) {
1705                         int pid = fork();
1706                         
1707                         if (pid > 0) {
1708                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
1709                                 process++;
1710                         }else if (pid == 0){
1711                                 num = driverAlignSummary(sims, scores, inserts, alignLines[process]);
1712                                 
1713                                 //pass numSeqs to parent
1714                                 ofstream out;
1715                                 string tempFile = alignreport + toString(getpid()) + ".num.temp";
1716                                 m->openOutputFile(tempFile, out);
1717                                 
1718                                 out << num << endl;
1719                                 out << sims.size() << endl;
1720                                 for (int k = 0; k < sims.size(); k++)           {               out << sims[k] << '\t';         }  out << endl;
1721                                 for (int k = 0; k < scores.size(); k++)         {               out << scores[k] << '\t';       }  out << endl;
1722                                 for (int k = 0; k < inserts.size(); k++)        {               out << inserts[k] << '\t';      }  out << endl;
1723                                 
1724                                 out.close();
1725                                 
1726                                 exit(0);
1727                         }else { 
1728                                 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine(); 
1729                                 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
1730                                 exit(0);
1731                         }
1732                 }
1733                 
1734                 num = driverAlignSummary(sims, scores, inserts, alignLines[0]);
1735                 
1736                 //force parent to wait until all the processes are done
1737                 for (int i=0;i<processIDS.size();i++) { 
1738                         int temp = processIDS[i];
1739                         wait(&temp);
1740                 }
1741                 
1742                 //parent reads in and combine Filter info
1743                 for (int i = 0; i < processIDS.size(); i++) {
1744                         string tempFilename = alignreport + toString(processIDS[i]) + ".num.temp";
1745                         ifstream in;
1746                         m->openInputFile(tempFilename, in);
1747                         
1748                         int temp, tempNum;
1749             float temp2;
1750                         in >> tempNum; m->gobble(in); num += tempNum;
1751                         in >> tempNum; m->gobble(in);
1752                         for (int k = 0; k < tempNum; k++)                       {               in >> temp2; sims.push_back(temp2);             }               m->gobble(in);
1753                         for (int k = 0; k < tempNum; k++)                       {               in >> temp2; scores.push_back(temp2);           }               m->gobble(in);
1754                         for (int k = 0; k < tempNum; k++)                       {               in >> temp; inserts.push_back(temp);    }               m->gobble(in);
1755                           
1756                         in.close();
1757                         m->mothurRemove(tempFilename);
1758                 }
1759                 
1760                 
1761 #else 
1762         //////////////////////////////////////////////////////////////////////////////////////////////////////
1763                 //Windows version shared memory, so be careful when passing variables through the seqSumData struct. 
1764                 //Above fork() will clone, so memory is separate, but that's not the case with windows, 
1765                 //Taking advantage of shared memory to allow both threads to add info to vectors.
1766                 //////////////////////////////////////////////////////////////////////////////////////////////////////
1767                 /*
1768                 vector<alignsData*> pDataArray; 
1769                 DWORD   dwThreadIdArray[processors-1];
1770                 HANDLE  hThreadArray[processors-1]; 
1771                 
1772                 //Create processor worker threads.
1773                 for( int i=0; i<processors-1; i++ ){
1774             
1775                         // Allocate memory for thread data.
1776                         alignsData* tempSum = new alignsData(alignreport, m, alignLines[i].start, alignLines[i].end, namefile, countfile, nameMap);
1777                         pDataArray.push_back(tempSum);
1778                         
1779                         //MySeqSumThreadFunction is in header. It must be global or static to work with the threads.
1780                         //default security attributes, thread function name, argument to thread function, use default creation flags, returns the thread identifier
1781                         hThreadArray[i] = CreateThread(NULL, 0, MyAlignsThreadFunction, pDataArray[i], 0, &dwThreadIdArray[i]);   
1782                 }*/
1783                 alignLines[processors-1].start = 0;
1784         //do your part
1785                 num = driverAlignSummary(sims, scores, inserts, alignLines[processors-1]);
1786        /*
1787                 //Wait until all threads have terminated.
1788                 WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
1789                 
1790                 //Close all thread handles and free memory allocations.
1791                 for(int i=0; i < pDataArray.size(); i++){
1792                         num += pDataArray[i]->count;
1793             if (pDataArray[i]->count != pDataArray[i]->end) {
1794                 m->mothurOut("[ERROR]: process " + toString(i) + " only processed " + toString(pDataArray[i]->count) + " of " + toString(pDataArray[i]->end) + " sequences assigned to it, quitting. \n"); m->control_pressed = true; 
1795             }
1796             for (int k = 0; k < pDataArray[i]->sims.size(); k++)        {       sims.push_back(pDataArray[i]->sims[k]);         }
1797                         for (int k = 0; k < pDataArray[i]->scores.size(); k++)      {   scores.push_back(pDataArray[i]->scores[k]);     }
1798             for (int k = 0; k < pDataArray[i]->inserts.size(); k++)     {       inserts.push_back(pDataArray[i]->inserts[k]);   }
1799                 CloseHandle(hThreadArray[i]);
1800                         delete pDataArray[i];
1801                 }
1802         */
1803 #endif          
1804         return num;
1805         }
1806         catch(exception& e) {
1807                 m->errorOut(e, "ScreenSeqsCommand", "createProcessesAlignSummary");
1808                 exit(1);
1809         }
1810 }
1811 //***************************************************************************************************************
1812 int ScreenSeqsCommand::getSummary(vector<unsigned long long>& positions){
1813         try {
1814                 
1815                 vector<int> startPosition;
1816                 vector<int> endPosition;
1817                 vector<int> seqLength;
1818                 vector<int> ambigBases;
1819                 vector<int> longHomoPolymer;
1820         vector<int> numNs;
1821                 
1822         vector<unsigned long long> positions;
1823 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
1824                 positions = m->divideFile(fastafile, processors);
1825                 for (int i = 0; i < (positions.size()-1); i++) { lines.push_back(linePair(positions[i], positions[(i+1)])); }   
1826 #else
1827                 if(processors == 1){ lines.push_back(linePair(0, 1000));  }
1828         else {
1829             int numFastaSeqs = 0;
1830             positions = m->setFilePosFasta(fastafile, numFastaSeqs); 
1831             if (positions.size() < processors) { processors = positions.size(); }
1832             
1833             //figure out how many sequences you have to process
1834             int numSeqsPerProcessor = numFastaSeqs / processors;
1835             for (int i = 0; i < processors; i++) {
1836                 int startIndex =  i * numSeqsPerProcessor;
1837                 if(i == (processors - 1)){      numSeqsPerProcessor = numFastaSeqs - i * numSeqsPerProcessor;   }
1838                 lines.push_back(linePair(positions[startIndex], numSeqsPerProcessor));
1839             }
1840         }
1841 #endif
1842                 
1843 #ifdef USE_MPI
1844                 int pid;
1845                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); 
1846                 
1847                 if (pid == 0) { 
1848                         driverCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, numNs, fastafile, lines[0]);
1849 #else
1850                 int numSeqs = 0;
1851                 //#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
1852                         if(processors == 1){
1853                                 numSeqs = driverCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, numNs, fastafile, lines[0]);
1854                         }else{
1855                                 numSeqs = createProcessesCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, numNs, fastafile); 
1856                         }
1857                                 
1858                         if (m->control_pressed) {  return 0; }
1859 #endif
1860                 sort(startPosition.begin(), startPosition.end());
1861                 sort(endPosition.begin(), endPosition.end());
1862                 sort(seqLength.begin(), seqLength.end());
1863                 sort(ambigBases.begin(), ambigBases.end());
1864                 sort(longHomoPolymer.begin(), longHomoPolymer.end());
1865         sort(numNs.begin(), numNs.end());
1866                 
1867                 //numSeqs is the number of unique seqs, startPosition.size() is the total number of seqs, we want to optimize using all seqs
1868                 int criteriaPercentile  = int(startPosition.size() * (criteria / (float) 100));
1869                 
1870                 for (int i = 0; i < optimize.size(); i++) {
1871                         if (optimize[i] == "start") { startPos = startPosition[criteriaPercentile]; m->mothurOut("Optimizing start to " + toString(startPos) + "."); m->mothurOutEndLine(); }
1872                         else if (optimize[i] == "end") { int endcriteriaPercentile = int(endPosition.size() * ((100 - criteria) / (float) 100));  endPos = endPosition[endcriteriaPercentile]; m->mothurOut("Optimizing end to " + toString(endPos) + "."); m->mothurOutEndLine();}
1873                         else if (optimize[i] == "maxambig") { maxAmbig = ambigBases[criteriaPercentile]; m->mothurOut("Optimizing maxambig to " + toString(maxAmbig) + "."); m->mothurOutEndLine(); }
1874                         else if (optimize[i] == "maxhomop") { maxHomoP = longHomoPolymer[criteriaPercentile]; m->mothurOut("Optimizing maxhomop to " + toString(maxHomoP) + "."); m->mothurOutEndLine(); }
1875                         else if (optimize[i] == "minlength") { int mincriteriaPercentile = int(seqLength.size() * ((100 - criteria) / (float) 100)); minLength = seqLength[mincriteriaPercentile]; m->mothurOut("Optimizing minlength to " + toString(minLength) + "."); m->mothurOutEndLine(); }
1876                         else if (optimize[i] == "maxlength") { maxLength = seqLength[criteriaPercentile]; m->mothurOut("Optimizing maxlength to " + toString(maxLength) + "."); m->mothurOutEndLine(); }
1877             else if (optimize[i] == "maxn") { maxN = numNs[criteriaPercentile]; m->mothurOut("Optimizing maxn to " + toString(maxN) + "."); m->mothurOutEndLine(); }
1878                 }
1879
1880 #ifdef USE_MPI
1881                 }
1882                 
1883                 MPI_Status status; 
1884                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); 
1885                 MPI_Comm_size(MPI_COMM_WORLD, &processors); 
1886                         
1887                 if (pid == 0) { 
1888                         //send file positions to all processes
1889                         for(int i = 1; i < processors; i++) { 
1890                                 MPI_Send(&startPos, 1, MPI_INT, i, 2001, MPI_COMM_WORLD);
1891                                 MPI_Send(&endPos, 1, MPI_INT, i, 2001, MPI_COMM_WORLD);
1892                                 MPI_Send(&maxAmbig, 1, MPI_INT, i, 2001, MPI_COMM_WORLD);
1893                                 MPI_Send(&maxHomoP, 1, MPI_INT, i, 2001, MPI_COMM_WORLD);
1894                                 MPI_Send(&minLength, 1, MPI_INT, i, 2001, MPI_COMM_WORLD);
1895                                 MPI_Send(&maxLength, 1, MPI_INT, i, 2001, MPI_COMM_WORLD);
1896                 MPI_Send(&maxN, 1, MPI_INT, i, 2001, MPI_COMM_WORLD);
1897                         }
1898                 }else {
1899                         MPI_Recv(&startPos, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status);
1900                         MPI_Recv(&endPos, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status);
1901                         MPI_Recv(&maxAmbig, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status);
1902                         MPI_Recv(&maxHomoP, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status);
1903                         MPI_Recv(&minLength, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status);
1904                         MPI_Recv(&maxLength, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status);
1905             MPI_Recv(&maxN, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status);
1906                 }
1907                 MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case
1908 #endif
1909                 return 0;
1910         }
1911         catch(exception& e) {
1912                 m->errorOut(e, "ScreenSeqsCommand", "getSummary");
1913                 exit(1);
1914         }
1915 }
1916 /**************************************************************************************/
1917 int ScreenSeqsCommand::driverCreateSummary(vector<int>& startPosition, vector<int>& endPosition, vector<int>& seqLength, vector<int>& ambigBases, vector<int>& longHomoPolymer, vector<int>& numNs, string filename, linePair filePos) {        
1918         try {
1919                 
1920                 ifstream in;
1921                 m->openInputFile(filename, in);
1922                                 
1923                 in.seekg(filePos.start);
1924
1925                 bool done = false;
1926                 int count = 0;
1927         
1928                 while (!done) {
1929                                 
1930                         if (m->control_pressed) { in.close(); return 1; }
1931                                         
1932                         Sequence current(in); m->gobble(in);
1933         
1934                         if (current.getName() != "") {
1935                                 int num = 1;
1936                                 if ((namefile != "") || (countfile !="")){
1937                                         //make sure this sequence is in the namefile, else error 
1938                                         map<string, int>::iterator it = nameMap.find(current.getName());
1939                                         
1940                                         if (it == nameMap.end()) { m->mothurOut("[ERROR]: " + current.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); m->control_pressed = true; }
1941                                         else { num = it->second; }
1942                                 }
1943                                 
1944                                 //for each sequence this sequence represents
1945                 int numns = current.getNumNs();
1946                                 for (int i = 0; i < num; i++) {
1947                                         startPosition.push_back(current.getStartPos());
1948                                         endPosition.push_back(current.getEndPos());
1949                                         seqLength.push_back(current.getNumBases());
1950                                         ambigBases.push_back(current.getAmbigBases());
1951                                         longHomoPolymer.push_back(current.getLongHomoPolymer());
1952                     numNs.push_back(numns);
1953                                 }
1954                                 
1955                                 count++;
1956                         }
1957                         //if((count) % 100 == 0){       m->mothurOut("Optimizing sequence: " + toString(count)); m->mothurOutEndLine();         }
1958                         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
1959                                 unsigned long long pos = in.tellg();
1960                                 if ((pos == -1) || (pos >= filePos.end)) { break; }
1961                         #else
1962                                 if (in.eof()) { break; }
1963                         #endif
1964                         
1965                 }
1966                 
1967                 in.close();
1968                 
1969                 return count;
1970         }
1971         catch(exception& e) {
1972                 m->errorOut(e, "ScreenSeqsCommand", "driverCreateSummary");
1973                 exit(1);
1974         }
1975 }
1976 /**************************************************************************************************/
1977 int ScreenSeqsCommand::createProcessesCreateSummary(vector<int>& startPosition, vector<int>& endPosition, vector<int>& seqLength, vector<int>& ambigBases, vector<int>& longHomoPolymer, vector<int>& numNs, string filename) {
1978         try {
1979         
1980         int process = 1;
1981                 int num = 0;
1982                 vector<int> processIDS;
1983
1984 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
1985                                 
1986                 //loop through and create all the processes you want
1987                 while (process != processors) {
1988                         int pid = fork();
1989                         
1990                         if (pid > 0) {
1991                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
1992                                 process++;
1993                         }else if (pid == 0){
1994                                 num = driverCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, numNs, fastafile, lines[process]);
1995                                 
1996                                 //pass numSeqs to parent
1997                                 ofstream out;
1998                                 string tempFile = fastafile + toString(getpid()) + ".num.temp";
1999                                 m->openOutputFile(tempFile, out);
2000                                 
2001                                 out << num << endl;
2002                                 out << startPosition.size() << endl;
2003                                 for (int k = 0; k < startPosition.size(); k++)          {               out << startPosition[k] << '\t'; }  out << endl;
2004                                 for (int k = 0; k < endPosition.size(); k++)            {               out << endPosition[k] << '\t'; }  out << endl;
2005                                 for (int k = 0; k < seqLength.size(); k++)                      {               out << seqLength[k] << '\t'; }  out << endl;
2006                                 for (int k = 0; k < ambigBases.size(); k++)                     {               out << ambigBases[k] << '\t'; }  out << endl;
2007                                 for (int k = 0; k < longHomoPolymer.size(); k++)        {               out << longHomoPolymer[k] << '\t'; }  out << endl;
2008                 for (int k = 0; k < numNs.size(); k++)  {               out << numNs[k] << '\t'; }  out << endl;
2009                                 
2010                                 out.close();
2011                                 
2012                                 exit(0);
2013                         }else { 
2014                                 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine(); 
2015                                 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
2016                                 exit(0);
2017                         }
2018                 }
2019                 
2020                 num = driverCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, numNs, fastafile, lines[0]);
2021                 
2022                 //force parent to wait until all the processes are done
2023                 for (int i=0;i<processIDS.size();i++) { 
2024                         int temp = processIDS[i];
2025                         wait(&temp);
2026                 }
2027                 
2028                 //parent reads in and combine Filter info
2029                 for (int i = 0; i < processIDS.size(); i++) {
2030                         string tempFilename = fastafile + toString(processIDS[i]) + ".num.temp";
2031                         ifstream in;
2032                         m->openInputFile(tempFilename, in);
2033                         
2034                         int temp, tempNum;
2035                         in >> tempNum; m->gobble(in); num += tempNum;
2036                         in >> tempNum; m->gobble(in);
2037                         for (int k = 0; k < tempNum; k++)                       {               in >> temp; startPosition.push_back(temp);              }               m->gobble(in);
2038                         for (int k = 0; k < tempNum; k++)                       {               in >> temp; endPosition.push_back(temp);                }               m->gobble(in);
2039                         for (int k = 0; k < tempNum; k++)                       {               in >> temp; seqLength.push_back(temp);                  }               m->gobble(in);
2040                         for (int k = 0; k < tempNum; k++)                       {               in >> temp; ambigBases.push_back(temp);                 }               m->gobble(in);
2041                         for (int k = 0; k < tempNum; k++)                       {               in >> temp; longHomoPolymer.push_back(temp);    }               m->gobble(in);
2042             for (int k = 0; k < tempNum; k++)                   {               in >> temp; numNs.push_back(temp);      }               m->gobble(in);
2043                                 
2044                         in.close();
2045                         m->mothurRemove(tempFilename);
2046                 }
2047                 
2048                 
2049 #else 
2050         //////////////////////////////////////////////////////////////////////////////////////////////////////
2051                 //Windows version shared memory, so be careful when passing variables through the seqSumData struct. 
2052                 //Above fork() will clone, so memory is separate, but that's not the case with windows, 
2053                 //Taking advantage of shared memory to allow both threads to add info to vectors.
2054                 //////////////////////////////////////////////////////////////////////////////////////////////////////
2055                 
2056                 vector<sumData*> pDataArray; 
2057                 DWORD   dwThreadIdArray[processors-1];
2058                 HANDLE  hThreadArray[processors-1]; 
2059                 
2060                 //Create processor worker threads.
2061                 for( int i=0; i<processors-1; i++ ){
2062             
2063                         // Allocate memory for thread data.
2064                         sumData* tempSum = new sumData(filename, m, lines[i].start, lines[i].end, namefile, countfile, nameMap);
2065                         pDataArray.push_back(tempSum);
2066                         
2067                         //MySeqSumThreadFunction is in header. It must be global or static to work with the threads.
2068                         //default security attributes, thread function name, argument to thread function, use default creation flags, returns the thread identifier
2069                         hThreadArray[i] = CreateThread(NULL, 0, MySumThreadFunction, pDataArray[i], 0, &dwThreadIdArray[i]);   
2070                 }
2071                 
2072         //do your part
2073                 num = driverCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, numNs, fastafile, lines[processors-1]);
2074          
2075                 //Wait until all threads have terminated.
2076                 WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
2077                 
2078                 //Close all thread handles and free memory allocations.
2079                 for(int i=0; i < pDataArray.size(); i++){
2080                         num += pDataArray[i]->count;
2081             if (pDataArray[i]->count != pDataArray[i]->end) {
2082                 m->mothurOut("[ERROR]: process " + toString(i) + " only processed " + toString(pDataArray[i]->count) + " of " + toString(pDataArray[i]->end) + " sequences assigned to it, quitting. \n"); m->control_pressed = true; 
2083             }
2084             for (int k = 0; k < pDataArray[i]->startPosition.size(); k++) {     startPosition.push_back(pDataArray[i]->startPosition[k]);       }
2085                         for (int k = 0; k < pDataArray[i]->endPosition.size(); k++) {   endPosition.push_back(pDataArray[i]->endPosition[k]);       }
2086             for (int k = 0; k < pDataArray[i]->seqLength.size(); k++) { seqLength.push_back(pDataArray[i]->seqLength[k]);       }
2087             for (int k = 0; k < pDataArray[i]->ambigBases.size(); k++) {        ambigBases.push_back(pDataArray[i]->ambigBases[k]);       }
2088             for (int k = 0; k < pDataArray[i]->longHomoPolymer.size(); k++) {   longHomoPolymer.push_back(pDataArray[i]->longHomoPolymer[k]);       }
2089             for (int k = 0; k < pDataArray[i]->numNs.size(); k++) {     numNs.push_back(pDataArray[i]->numNs[k]);       }
2090                         CloseHandle(hThreadArray[i]);
2091                         delete pDataArray[i];
2092                 }
2093
2094 #endif          
2095         return num;
2096         }
2097         catch(exception& e) {
2098                 m->errorOut(e, "ScreenSeqsCommand", "createProcessesCreateSummary");
2099                 exit(1);
2100         }
2101 }
2102
2103 //***************************************************************************************************************
2104
2105 int ScreenSeqsCommand::screenGroupFile(map<string, string> badSeqNames){
2106         try {
2107                 ifstream inputGroups;
2108                 m->openInputFile(groupfile, inputGroups);
2109                 string seqName, group;
2110                 map<string, string>::iterator it;
2111                 map<string, string> variables;
2112                 variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(groupfile));
2113         variables["[extension]"] = m->getExtension(groupfile);
2114         string goodGroupFile = getOutputFileName("group", variables);
2115         outputNames.push_back(goodGroupFile);  outputTypes["group"].push_back(goodGroupFile);
2116                 ofstream goodGroupOut;  m->openOutputFile(goodGroupFile, goodGroupOut);
2117                 
2118                 while(!inputGroups.eof()){
2119                         if (m->control_pressed) { goodGroupOut.close(); inputGroups.close(); m->mothurRemove(goodGroupFile); return 0; }
2120
2121                         inputGroups >> seqName; m->gobble(inputGroups); inputGroups >> group;
2122                         it = badSeqNames.find(seqName);
2123                         
2124                         if(it != badSeqNames.end()){
2125                                 badSeqNames.erase(it);
2126                         }
2127                         else{
2128                                 goodGroupOut << seqName << '\t' << group << endl;
2129                         }
2130                         m->gobble(inputGroups);
2131                 }
2132                 
2133                 if (m->control_pressed) { goodGroupOut.close();  inputGroups.close(); m->mothurRemove(goodGroupFile);  return 0; }
2134
2135                 //we were unable to remove some of the bad sequences
2136                 if (badSeqNames.size() != 0) {
2137                         for (it = badSeqNames.begin(); it != badSeqNames.end(); it++) {  
2138                                 m->mothurOut("Your groupfile does not include the sequence " + it->first + " please correct."); 
2139                                 m->mothurOutEndLine();
2140                         }
2141                 }
2142                 
2143                 inputGroups.close();
2144                 goodGroupOut.close();
2145                 
2146                 if (m->control_pressed) { m->mothurRemove(goodGroupFile);   }
2147                 
2148                 return 0;
2149         
2150         }
2151         catch(exception& e) {
2152                 m->errorOut(e, "ScreenSeqsCommand", "screenGroupFile");
2153                 exit(1);
2154         }
2155 }
2156 //***************************************************************************************************************
2157 int ScreenSeqsCommand::screenCountFile(map<string, string> badSeqNames){
2158         try {
2159                 ifstream in;
2160                 m->openInputFile(countfile, in);
2161                 map<string, string>::iterator it;
2162                 map<string, string> variables;
2163                 variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(countfile));
2164         variables["[extension]"] = m->getExtension(countfile);
2165         string goodCountFile = getOutputFileName("count", variables);
2166                 
2167         outputNames.push_back(goodCountFile);  outputTypes["count"].push_back(goodCountFile);
2168                 ofstream goodCountOut;  m->openOutputFile(goodCountFile, goodCountOut);
2169                 
2170         string headers = m->getline(in); m->gobble(in);
2171         goodCountOut << headers << endl;
2172         
2173         string name, rest; int thisTotal;
2174         while (!in.eof()) {
2175
2176                         if (m->control_pressed) { goodCountOut.close(); in.close(); m->mothurRemove(goodCountFile); return 0; }
2177             
2178                         in >> name; m->gobble(in); 
2179             in >> thisTotal; m->gobble(in);
2180             rest = m->getline(in); m->gobble(in);
2181             
2182                         it = badSeqNames.find(name);
2183                         
2184                         if(it != badSeqNames.end()){
2185                                 badSeqNames.erase(it);
2186                         }
2187                         else{
2188                                 goodCountOut << name << '\t' << thisTotal << '\t' << rest << endl;
2189                         }
2190                 }
2191                 
2192                 if (m->control_pressed) { goodCountOut.close();  in.close(); m->mothurRemove(goodCountFile);  return 0; }
2193         
2194                 //we were unable to remove some of the bad sequences
2195                 if (badSeqNames.size() != 0) {
2196                         for (it = badSeqNames.begin(); it != badSeqNames.end(); it++) {  
2197                                 m->mothurOut("Your count file does not include the sequence " + it->first + " please correct."); 
2198                                 m->mothurOutEndLine();
2199                         }
2200                 }
2201                 
2202                 in.close();
2203                 goodCountOut.close();
2204         
2205         //check for groups that have been eliminated
2206         CountTable ct;
2207         if (ct.testGroups(goodCountFile)) {
2208             ct.readTable(goodCountFile);
2209             ct.printTable(goodCountFile);
2210         }
2211                 
2212                 if (m->control_pressed) { m->mothurRemove(goodCountFile);   }
2213                 
2214                 return 0;
2215         
2216         }
2217         catch(exception& e) {
2218                 m->errorOut(e, "ScreenSeqsCommand", "screenCountFile");
2219                 exit(1);
2220         }
2221 }
2222 //***************************************************************************************************************
2223
2224 int ScreenSeqsCommand::screenTaxonomy(map<string, string> badSeqNames){
2225         try {
2226                 ifstream input;
2227                 m->openInputFile(taxonomy, input);
2228                 string seqName, tax;
2229                 map<string, string>::iterator it;
2230         map<string, string> variables;
2231                 variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(taxonomy));
2232         variables["[extension]"] = m->getExtension(taxonomy);
2233         string goodTaxFile = getOutputFileName("taxonomy", variables);
2234
2235                 outputNames.push_back(goodTaxFile);  outputTypes["taxonomy"].push_back(goodTaxFile);
2236                 ofstream goodTaxOut;    m->openOutputFile(goodTaxFile, goodTaxOut);
2237                                 
2238                 while(!input.eof()){
2239                         if (m->control_pressed) { goodTaxOut.close(); input.close(); m->mothurRemove(goodTaxFile); return 0; }
2240                         
2241                         input >> seqName; m->gobble(input); input >> tax;
2242                         it = badSeqNames.find(seqName);
2243                         
2244                         if(it != badSeqNames.end()){ badSeqNames.erase(it); }
2245                         else{
2246                                 goodTaxOut << seqName << '\t' << tax << endl;
2247                         }
2248                         m->gobble(input);
2249                 }
2250                 
2251                 if (m->control_pressed) { goodTaxOut.close(); input.close(); m->mothurRemove(goodTaxFile); return 0; }
2252                 
2253                 //we were unable to remove some of the bad sequences
2254                 if (badSeqNames.size() != 0) {
2255                         for (it = badSeqNames.begin(); it != badSeqNames.end(); it++) {  
2256                                 m->mothurOut("Your taxonomy file does not include the sequence " + it->first + " please correct."); 
2257                                 m->mothurOutEndLine();
2258                         }
2259                 }
2260                 
2261                 input.close();
2262                 goodTaxOut.close();
2263                 
2264                 if (m->control_pressed) {  m->mothurRemove(goodTaxFile);  return 0; }
2265                 
2266                 return 0;
2267                 
2268         }
2269         catch(exception& e) {
2270                 m->errorOut(e, "ScreenSeqsCommand", "screenTaxonomy");
2271                 exit(1);
2272         }
2273         
2274 }
2275 //***************************************************************************************************************
2276
2277 int ScreenSeqsCommand::screenQual(map<string, string> badSeqNames){
2278         try {
2279                 ifstream in;
2280                 m->openInputFile(qualfile, in);
2281                 map<string, string>::iterator it;
2282                 map<string, string> variables;
2283                 variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(qualfile));
2284         variables["[extension]"] = m->getExtension(qualfile);
2285         string goodQualFile = getOutputFileName("qfile", variables);
2286                 
2287                 outputNames.push_back(goodQualFile);  outputTypes["qfile"].push_back(goodQualFile);
2288                 ofstream goodQual;      m->openOutputFile(goodQualFile, goodQual);
2289                 
2290                 while(!in.eof()){       
2291                         
2292                         if (m->control_pressed) { goodQual.close(); in.close(); m->mothurRemove(goodQualFile); return 0; }
2293
2294                         string saveName = "";
2295                         string name = "";
2296                         string scores = "";
2297                         
2298                         in >> name; 
2299                         
2300                         if (name.length() != 0) { 
2301                                 saveName = name.substr(1);
2302                                 while (!in.eof())       {       
2303                                         char c = in.get(); 
2304                                         if (c == 10 || c == 13 || c == -1){     break;  }
2305                                         else { name += c; }     
2306                                 } 
2307                                 m->gobble(in);
2308                         }
2309                         
2310                         while(in){
2311                                 char letter= in.get();
2312                                 if(letter == '>'){      in.putback(letter);     break;  }
2313                                 else{ scores += letter; }
2314                         }
2315                         
2316                         m->gobble(in);
2317                         
2318                         it = badSeqNames.find(saveName);
2319                         
2320                         if(it != badSeqNames.end()){
2321                                 badSeqNames.erase(it);
2322                         }else{                          
2323                                 goodQual << name << endl << scores;
2324                         }
2325                         
2326                         m->gobble(in);
2327                 }
2328                 
2329                 in.close();
2330                 goodQual.close();
2331                 
2332                 //we were unable to remove some of the bad sequences
2333                 if (badSeqNames.size() != 0) {
2334                         for (it = badSeqNames.begin(); it != badSeqNames.end(); it++) {  
2335                                 m->mothurOut("Your qual file does not include the sequence " + it->first + " please correct."); 
2336                                 m->mothurOutEndLine();
2337                         }
2338                 }
2339                 
2340                 if (m->control_pressed) {  m->mothurRemove(goodQualFile);  return 0; }
2341                 
2342                 return 0;
2343                 
2344         }
2345         catch(exception& e) {
2346                 m->errorOut(e, "ScreenSeqsCommand", "screenQual");
2347                 exit(1);
2348         }
2349         
2350 }
2351 //**********************************************************************************************************************
2352
2353 int ScreenSeqsCommand::driver(linePair filePos, string goodFName, string badAccnosFName, string filename, map<string, string>& badSeqNames){
2354         try {
2355                 ofstream goodFile;
2356                 m->openOutputFile(goodFName, goodFile);
2357                 
2358                 ofstream badAccnosFile;
2359                 m->openOutputFile(badAccnosFName, badAccnosFile);
2360                 
2361                 ifstream inFASTA;
2362                 m->openInputFile(filename, inFASTA);
2363
2364                 inFASTA.seekg(filePos.start);
2365
2366                 bool done = false;
2367                 int count = 0;
2368         
2369                 while (!done) {
2370                 
2371                         if (m->control_pressed) {  return 0; }
2372                         
2373                         Sequence currSeq(inFASTA); m->gobble(inFASTA);
2374                         if (currSeq.getName() != "") {
2375                                 bool goodSeq = 1;               //      innocent until proven guilty
2376                 string trashCode = "";
2377                 //have the report files found you bad
2378                 map<string, string>::iterator it = badSeqNames.find(currSeq.getName());
2379                 if (it != badSeqNames.end()) { goodSeq = 0;  trashCode = it->second; }  
2380                 
2381                 if (summaryfile == "") { //summaryfile includes these so no need to check again
2382                     if(startPos != -1 && startPos < currSeq.getStartPos())                      {       goodSeq = 0;    trashCode += "start|"; }
2383                     if(endPos != -1 && endPos > currSeq.getEndPos())                            {       goodSeq = 0;    trashCode += "end|";}
2384                     if(maxAmbig != -1 && maxAmbig <     currSeq.getAmbigBases())                {       goodSeq = 0;    trashCode += "ambig|";}
2385                     if(maxHomoP != -1 && maxHomoP < currSeq.getLongHomoPolymer())       {       goodSeq = 0;    trashCode += "homop|";}
2386                     if(minLength != -1 && minLength > currSeq.getNumBases())            {       goodSeq = 0;    trashCode += "<length|";}
2387                     if(maxLength != -1 && maxLength < currSeq.getNumBases())            {       goodSeq = 0;    trashCode += ">length|";}
2388                 }
2389                 
2390                 if (contigsreport == "") { //contigs report includes this so no need to check again
2391                     if(maxN != -1 && maxN < currSeq.getNumNs())                     {   goodSeq = 0;    trashCode += "n|"; }
2392                 }
2393                                 
2394                                 if(goodSeq == 1){
2395                                         currSeq.printSequence(goodFile);        
2396                                 }else{
2397                                         badAccnosFile << currSeq.getName() << '\t' << trashCode.substr(0, trashCode.length()-1) << endl;
2398                                         badSeqNames[currSeq.getName()] = trashCode;
2399                                 }
2400                 count++;
2401                         }
2402                         
2403                         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
2404                                 unsigned long long pos = inFASTA.tellg();
2405                                 if ((pos == -1) || (pos >= filePos.end)) { break; }
2406                         #else
2407                                 if (inFASTA.eof()) { break; }
2408                         #endif
2409                         
2410                         //report progress
2411                         if((count) % 100 == 0){ m->mothurOut("Processing sequence: " + toString(count)); m->mothurOutEndLine();         }
2412                 }
2413                 //report progress
2414                 if((count) % 100 != 0){ m->mothurOut("Processing sequence: " + toString(count)); m->mothurOutEndLine();         }
2415                 
2416                         
2417                 goodFile.close();
2418                 inFASTA.close();
2419                 badAccnosFile.close();
2420                 
2421                 return count;
2422         }
2423         catch(exception& e) {
2424                 m->errorOut(e, "ScreenSeqsCommand", "driver");
2425                 exit(1);
2426         }
2427 }
2428 //**********************************************************************************************************************
2429 #ifdef USE_MPI
2430 int ScreenSeqsCommand::driverMPI(int start, int num, MPI_File& inMPI, MPI_File& goodFile, MPI_File& badAccnosFile, vector<unsigned long long>& MPIPos, map<string, string>& badSeqNames){
2431         try {
2432                 string outputString = "";
2433                 MPI_Status statusGood; 
2434                 MPI_Status statusBadAccnos; 
2435                 MPI_Status status; 
2436                 int pid;
2437                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
2438
2439                 for(int i=0;i<num;i++){
2440                 
2441                         if (m->control_pressed) {  return 0; }
2442                         
2443                         //read next sequence
2444                         int length = MPIPos[start+i+1] - MPIPos[start+i];
2445
2446                         char* buf4 = new char[length];
2447
2448                         MPI_File_read_at(inMPI, MPIPos[start+i], buf4, length, MPI_CHAR, &status);
2449                         
2450                         string tempBuf = buf4;  delete buf4;
2451                         if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length);  }
2452                         istringstream iss (tempBuf,istringstream::in);
2453                         
2454                         Sequence currSeq(iss);                  
2455                         
2456                         //process seq
2457                         if (currSeq.getName() != "") {
2458                                 bool goodSeq = 1;               //      innocent until proven guilty
2459                 string trashCode = "";
2460                 //have the report files found you bad
2461                 map<string, string>::iterator it = badSeqNames.find(currSeq.getName());
2462                 if (it != badSeqNames.end()) { goodSeq = 0;  trashCode = it->second; }  
2463                 
2464                 if (summaryfile == "") { //summaryfile includes these so no need to check again
2465                     if(startPos != -1 && startPos < currSeq.getStartPos())                      {       goodSeq = 0;    trashCode += "start|"; }
2466                     if(endPos != -1 && endPos > currSeq.getEndPos())                            {       goodSeq = 0;    trashCode += "end|";}
2467                     if(maxAmbig != -1 && maxAmbig <     currSeq.getAmbigBases())                {       goodSeq = 0;    trashCode += "ambig|";}
2468                     if(maxHomoP != -1 && maxHomoP < currSeq.getLongHomoPolymer())       {       goodSeq = 0;    trashCode += "homop|";}
2469                     if(minLength != -1 && minLength > currSeq.getNumBases())            {       goodSeq = 0;    trashCode += "<length|";}
2470                     if(maxLength != -1 && maxLength < currSeq.getNumBases())            {       goodSeq = 0;    trashCode += ">length|";}
2471                 }
2472                 
2473                 if (contigsreport == "") { //contigs report includes this so no need to check again
2474                     if(maxN != -1 && maxN < currSeq.getNumNs())                     {   goodSeq = 0;    trashCode += "n|"; }
2475                 }
2476                                 
2477                 
2478                                 if(goodSeq == 1){
2479                                         outputString =  ">" + currSeq.getName() + "\n" + currSeq.getAligned() + "\n";
2480                                 
2481                                         //print good seq
2482                                         length = outputString.length();
2483                                         char* buf2 = new char[length];
2484                                         memcpy(buf2, outputString.c_str(), length);
2485                                         
2486                                         MPI_File_write_shared(goodFile, buf2, length, MPI_CHAR, &statusGood);
2487                                         delete buf2;
2488                                 }
2489                                 else{
2490
2491                                         badSeqNames[currSeq.getName()] = trashCode;
2492                                         
2493                                         //write to bad accnos file
2494                                         outputString = currSeq.getName() + "\t" + trashCode.substr(0, trashCode.length()-1) + "\n";
2495                                 
2496                                         length = outputString.length();
2497                                         char* buf3 = new char[length];
2498                                         memcpy(buf3, outputString.c_str(), length);
2499                                         
2500                                         MPI_File_write_shared(badAccnosFile, buf3, length, MPI_CHAR, &statusBadAccnos);
2501                                         delete buf3;
2502                                 }
2503                         }
2504                         
2505                         //report progress
2506                         if((i) % 100 == 0){     m->mothurOut("Processing sequence: " + toString(i)); m->mothurOutEndLine();             }
2507                 }
2508                                 
2509                 return 1;
2510         }
2511         catch(exception& e) {
2512                 m->errorOut(e, "ScreenSeqsCommand", "driverMPI");
2513                 exit(1);
2514         }
2515 }
2516 #endif
2517 /**************************************************************************************************/
2518
2519 int ScreenSeqsCommand::createProcesses(string goodFileName, string badAccnos, string filename, map<string, string>& badSeqNames) {
2520         try {
2521         
2522         vector<int> processIDS;   
2523         int process = 1;
2524                 int num = 0;
2525
2526 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
2527                                 
2528                 //loop through and create all the processes you want
2529                 while (process != processors) {
2530                         int pid = fork();
2531                         
2532                         if (pid > 0) {
2533                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
2534                                 process++;
2535                         }else if (pid == 0){
2536                                 num = driver(lines[process], goodFileName + toString(getpid()) + ".temp", badAccnos + toString(getpid()) + ".temp", filename, badSeqNames);
2537                                 
2538                                 //pass numSeqs to parent
2539                                 ofstream out;
2540                                 string tempFile = filename + toString(getpid()) + ".num.temp";
2541                                 m->openOutputFile(tempFile, out);
2542                                 out << num << endl;
2543                                 out.close();
2544                                 
2545                                 exit(0);
2546                         }else { 
2547                                 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine(); 
2548                                 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
2549                                 exit(0);
2550                         }
2551                 }
2552                 
2553         num = driver(lines[0], goodFileName, badAccnos, filename, badSeqNames);
2554         
2555                 //force parent to wait until all the processes are done
2556                 for (int i=0;i<processIDS.size();i++) { 
2557                         int temp = processIDS[i];
2558                         wait(&temp);
2559                 }
2560                 
2561                 for (int i = 0; i < processIDS.size(); i++) {
2562                         ifstream in;
2563                         string tempFile =  filename + toString(processIDS[i]) + ".num.temp";
2564                         m->openInputFile(tempFile, in);
2565                         if (!in.eof()) { int tempNum = 0; in >> tempNum; num += tempNum; }
2566                         in.close(); m->mothurRemove(tempFile);
2567             
2568             m->appendFiles((goodFileName + toString(processIDS[i]) + ".temp"), goodFileName);
2569             m->mothurRemove((goodFileName + toString(processIDS[i]) + ".temp"));
2570                         
2571             m->appendFiles((badAccnos + toString(processIDS[i]) + ".temp"), badAccnos);
2572             m->mothurRemove((badAccnos + toString(processIDS[i]) + ".temp"));
2573                 }
2574                 
2575         //read badSeqs in because root process doesnt know what other "bad" seqs the children found
2576         ifstream inBad;
2577         int ableToOpen = m->openInputFile(badAccnos, inBad, "no error");
2578         
2579         if (ableToOpen == 0) {
2580             badSeqNames.clear();
2581             string tempName, trashCode;
2582             while (!inBad.eof()) {
2583                 inBad >> tempName >> trashCode; m->gobble(inBad);
2584                 badSeqNames[tempName] = trashCode;
2585             }
2586             inBad.close();
2587         }
2588 #else
2589         
2590         //////////////////////////////////////////////////////////////////////////////////////////////////////
2591                 //Windows version shared memory, so be careful when passing variables through the sumScreenData struct. 
2592                 //Above fork() will clone, so memory is separate, but that's not the case with windows, 
2593                 //Taking advantage of shared memory to allow both threads to add info to badSeqNames.
2594                 //////////////////////////////////////////////////////////////////////////////////////////////////////
2595                 
2596                 vector<sumScreenData*> pDataArray; 
2597                 DWORD   dwThreadIdArray[processors-1];
2598                 HANDLE  hThreadArray[processors-1]; 
2599                 
2600                 //Create processor worker threads.
2601                 for( int i=0; i<processors-1; i++ ){
2602             
2603             string extension = "";
2604             if (i!=0) {extension += toString(i) + ".temp"; processIDS.push_back(i); }
2605             
2606                         // Allocate memory for thread data.
2607                         sumScreenData* tempSum = new sumScreenData(startPos, endPos, maxAmbig, maxHomoP, minLength, maxLength, maxN, badSeqNames, filename, summaryfile, contigsreport, m, lines[i].start, lines[i].end,goodFileName+extension, badAccnos+extension);
2608                         pDataArray.push_back(tempSum);
2609                         
2610                         //default security attributes, thread function name, argument to thread function, use default creation flags, returns the thread identifier
2611                         hThreadArray[i] = CreateThread(NULL, 0, MySumScreenThreadFunction, pDataArray[i], 0, &dwThreadIdArray[i]);   
2612                 }
2613                 
2614         //do your part
2615         num = driver(lines[processors-1], (goodFileName+toString(processors-1)+".temp"), (badAccnos+toString(processors-1)+".temp"), filename, badSeqNames);
2616         processIDS.push_back(processors-1);
2617         
2618                 //Wait until all threads have terminated.
2619                 WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
2620                 
2621                 //Close all thread handles and free memory allocations.
2622                 for(int i=0; i < pDataArray.size(); i++){
2623                         num += pDataArray[i]->count;
2624             if (pDataArray[i]->count != pDataArray[i]->end) {
2625                 m->mothurOut("[ERROR]: process " + toString(i) + " only processed " + toString(pDataArray[i]->count) + " of " + toString(pDataArray[i]->end) + " sequences assigned to it, quitting. \n"); m->control_pressed = true; 
2626             }
2627             for (map<string, string>::iterator it = pDataArray[i]->badSeqNames.begin(); it != pDataArray[i]->badSeqNames.end(); it++) { badSeqNames[it->first] = it->second;       }
2628                         CloseHandle(hThreadArray[i]);
2629                         delete pDataArray[i];
2630                 }
2631         
2632         for (int i = 0; i < processIDS.size(); i++) {
2633             m->appendFiles((goodFileName + toString(processIDS[i]) + ".temp"), goodFileName);
2634             m->mothurRemove((goodFileName + toString(processIDS[i]) + ".temp"));
2635                         
2636             m->appendFiles((badAccnos + toString(processIDS[i]) + ".temp"), badAccnos);
2637             m->mothurRemove((badAccnos + toString(processIDS[i]) + ".temp"));
2638                 }
2639
2640 #endif  
2641         
2642         return num;
2643         
2644         }
2645         catch(exception& e) {
2646                 m->errorOut(e, "ScreenSeqsCommand", "createProcesses");
2647                 exit(1);
2648         }
2649 }
2650
2651 //***************************************************************************************************************
2652
2653