]> git.donarmstrong.com Git - mothur.git/blob - screenseqscommand.cpp
added modify names parameter to set.dir
[mothur.git] / screenseqscommand.cpp
1 /*
2  *  screenseqscommand.cpp
3  *  Mothur
4  *
5  *  Created by Pat Schloss on 6/3/09.
6  *  Copyright 2009 Patrick D. Schloss. All rights reserved.
7  *
8  */
9
10 #include "screenseqscommand.h"
11 #include "counttable.h"
12
13 //**********************************************************************************************************************
14 vector<string> ScreenSeqsCommand::setParameters(){      
15         try {
16                 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none","fasta",false,true,true); parameters.push_back(pfasta);
17         CommandParameter pcontigsreport("contigsreport", "InputTypes", "", "", "report", "none", "none","contigsreport",false,true,true); parameters.push_back(pcontigsreport);
18         CommandParameter palignreport("alignreport", "InputTypes", "", "", "report", "none", "none","alignreport",false,false); parameters.push_back(palignreport);
19         CommandParameter psummary("summary", "InputTypes", "", "", "report", "none", "none","summary",false,false); parameters.push_back(psummary);
20         CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "none","name",false,false,true); parameters.push_back(pname);
21         CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "none", "none","count",false,false,true); parameters.push_back(pcount);
22                 CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "none", "none","group",false,false,true); parameters.push_back(pgroup);
23                 CommandParameter pqfile("qfile", "InputTypes", "", "", "none", "none", "none","qfile",false,false); parameters.push_back(pqfile);
24                 
25                 CommandParameter ptax("taxonomy", "InputTypes", "", "", "none", "none", "none","taxonomy",false,false); parameters.push_back(ptax);
26                 CommandParameter pstart("start", "Number", "", "-1", "", "", "","",false,false,true); parameters.push_back(pstart);
27                 CommandParameter pend("end", "Number", "", "-1", "", "", "","",false,false,true); parameters.push_back(pend);
28                 CommandParameter pmaxambig("maxambig", "Number", "", "-1", "", "", "","",false,false); parameters.push_back(pmaxambig);
29                 CommandParameter pmaxhomop("maxhomop", "Number", "", "-1", "", "", "","",false,false); parameters.push_back(pmaxhomop);
30                 CommandParameter pminlength("minlength", "Number", "", "-1", "", "", "","",false,false); parameters.push_back(pminlength);
31                 CommandParameter pmaxlength("maxlength", "Number", "", "-1", "", "", "","",false,false); parameters.push_back(pmaxlength);
32                 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "","",false,false,true); parameters.push_back(pprocessors);
33                 CommandParameter pcriteria("criteria", "Number", "", "90", "", "", "","",false,false); parameters.push_back(pcriteria);
34                 CommandParameter poptimize("optimize", "Multiple", "none-start-end-maxambig-maxhomop-minlength-maxlength", "none", "", "", "","",true,false); parameters.push_back(poptimize);
35         CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir);
36                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir);
37         
38         //report parameters
39         CommandParameter pminoverlap("minoverlap", "Number", "", "-1", "", "", "","",false,false); parameters.push_back(pminoverlap);
40         CommandParameter postart("ostart", "Number", "", "-1", "", "", "","",false,false); parameters.push_back(postart);
41         CommandParameter poend("oend", "Number", "", "-1", "", "", "","",false,false); parameters.push_back(poend);
42         CommandParameter pmismatches("mismatches", "Number", "", "-1", "", "", "","",false,false); parameters.push_back(pmismatches);
43         CommandParameter pmaxn("maxn", "Number", "", "-1", "", "", "","",false,false); parameters.push_back(pmaxn);
44         CommandParameter pminscore("minscore", "Number", "", "-1", "", "", "","",false,false); parameters.push_back(pminscore);
45         CommandParameter pmaxinsert("maxinsert", "Number", "", "-1", "", "", "","",false,false); parameters.push_back(pmaxinsert);
46         CommandParameter pminsim("minsim", "Number", "", "-1", "", "", "","",false,false); parameters.push_back(pminsim);
47
48                 
49                 
50                 vector<string> myArray;
51                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
52                 return myArray;
53         }
54         catch(exception& e) {
55                 m->errorOut(e, "ScreenSeqsCommand", "setParameters");
56                 exit(1);
57         }
58 }
59 //**********************************************************************************************************************
60 string ScreenSeqsCommand::getHelpString(){      
61         try {
62                 string helpString = "";
63                 helpString += "The screen.seqs command reads a fastafile and screens sequences.\n";
64                 helpString += "The screen.seqs command parameters are fasta, start, end, maxambig, maxhomop, minlength, maxlength, name, group, count, qfile, alignreport, contigsreport, summary, taxonomy, optimize, criteria and processors.\n";
65                 helpString += "The fasta parameter is required.\n";
66         helpString += "The contigsreport parameter allows you to use the contigsreport file to determine if a sequence is good. Screening parameters include: minoverlap, ostart, oend and mismatches. \n";
67         helpString += "The alignreport parameter allows you to use the alignreport file to determine if a sequence is good. Screening parameters include: minsim, minscore and maxinsert. \n";
68         helpString += "The summary parameter allows you to use the summary file from summary.seqs to save time processing.\n";
69                 helpString += "The taxonomy parameter allows you to remove bad seqs from taxonomy files.\n";
70                 helpString += "The start parameter is used to set a position the \"good\" sequences must start by. The default is -1.\n";
71                 helpString += "The end parameter is used to set a position the \"good\" sequences must end after. The default is -1.\n";
72                 helpString += "The maxambig parameter allows you to set the maximum number of ambigious bases allowed. The default is -1.\n";
73                 helpString += "The maxhomop parameter allows you to set a maximum homopolymer length. \n";
74                 helpString += "The minlength parameter allows you to set and minimum sequence length. \n";
75                 helpString += "The maxn parameter allows you to set and maximum number of N's allowed in a sequence. \n";
76         helpString += "The minoverlap parameter allows you to set and minimum overlap. The default is -1. \n";
77         helpString += "The ostart parameter is used to set an overlap position the \"good\" sequences must start by. The default is -1. \n";
78         helpString += "The oend parameter is used to set an overlap position the \"good\" sequences must end after. The default is -1.\n";
79         helpString += "The mismatches parameter allows you to set and maximum mismatches in the contigs.report. \n";
80         helpString += "The minsim parameter allows you to set the minimum similarity to template sequences during alignment. Found in column \'SimBtwnQuery&Template\' in align.report file.\n";
81         helpString += "The minscore parameter allows you to set the minimum search score during alignment. Found in column \'SearchScore\' in align.report file.\n";
82         helpString += "The maxinsert parameter allows you to set the maximum number of insertions during alignment. Found in column \'LongestInsert\' in align.report file.\n";
83                 helpString += "The processors parameter allows you to specify the number of processors to use while running the command. The default is 1.\n";
84                 helpString += "The optimize and criteria parameters allow you set the start, end, maxabig, maxhomop, minlength and maxlength parameters relative to your set of sequences .\n";
85                 helpString += "For example optimize=start-end, criteria=90, would set the start and end values to the position 90% of your sequences started and ended.\n";
86                 helpString += "The name parameter allows you to provide a namesfile, and the group parameter allows you to provide a groupfile.\n";
87                 helpString += "The screen.seqs command should be in the following format: \n";
88                 helpString += "screen.seqs(fasta=yourFastaFile, name=youNameFile, group=yourGroupFIle, start=yourStart, end=yourEnd, maxambig=yourMaxambig,  \n";
89                 helpString += "maxhomop=yourMaxhomop, minlength=youMinlength, maxlength=yourMaxlength)  \n";    
90                 helpString += "Example screen.seqs(fasta=abrecovery.fasta, name=abrecovery.names, group=abrecovery.groups, start=..., end=..., maxambig=..., maxhomop=..., minlength=..., maxlength=...).\n";
91                 helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n";
92                 return helpString;
93         }
94         catch(exception& e) {
95                 m->errorOut(e, "ScreenSeqsCommand", "getHelpString");
96                 exit(1);
97         }
98 }
99 //**********************************************************************************************************************
100 string ScreenSeqsCommand::getOutputPattern(string type) {
101     try {
102         string pattern = "";
103         
104         if (type == "fasta")            {   pattern = "[filename],good,[extension]";    }
105         else if (type == "taxonomy")    {   pattern = "[filename],good,[extension]";    }
106         else if (type == "name")        {   pattern = "[filename],good,[extension]";    }
107         else if (type == "group")       {   pattern = "[filename],good,[extension]";    }
108         else if (type == "count")       {   pattern = "[filename],good,[extension]";    }
109         else if (type == "accnos")      {   pattern = "[filename],bad.accnos";          }
110         else if (type == "qfile")       {   pattern = "[filename],good,[extension]";    }
111         else if (type == "alignreport")      {   pattern = "[filename],good.align.report";    }
112         else if (type == "contigsreport")      {   pattern = "[filename],good.contigs.report";    }
113         else if (type == "summary")      {   pattern = "[filename],good.summary";    }
114         else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true;  }
115         
116         return pattern;
117     }
118     catch(exception& e) {
119         m->errorOut(e, "ScreenSeqsCommand", "getOutputPattern");
120         exit(1);
121     }
122 }
123 //**********************************************************************************************************************
124 ScreenSeqsCommand::ScreenSeqsCommand(){ 
125         try {
126                 abort = true; calledHelp = true; 
127                 setParameters();
128                 vector<string> tempOutNames;
129                 outputTypes["fasta"] = tempOutNames;
130                 outputTypes["name"] = tempOutNames;
131                 outputTypes["group"] = tempOutNames;
132                 outputTypes["alignreport"] = tempOutNames;
133         outputTypes["contigsreport"] = tempOutNames;
134         outputTypes["summary"] = tempOutNames;
135                 outputTypes["accnos"] = tempOutNames;
136                 outputTypes["qfile"] = tempOutNames;
137                 outputTypes["taxonomy"] = tempOutNames;
138         outputTypes["count"] = tempOutNames;
139         }
140         catch(exception& e) {
141                 m->errorOut(e, "ScreenSeqsCommand", "ScreenSeqsCommand");
142                 exit(1);
143         }
144 }
145 //***************************************************************************************************************
146
147 ScreenSeqsCommand::ScreenSeqsCommand(string option)  {
148         try {
149                 abort = false; calledHelp = false;   
150                 
151                 //allow user to run help
152                 if(option == "help") { help(); abort = true; calledHelp = true; }
153                 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
154                 
155                 else {
156                         vector<string> myArray = setParameters();
157                         
158                         OptionParser parser(option);
159                         map<string,string> parameters = parser.getParameters();
160                         
161                         ValidParameters validParameter("screen.seqs");
162                         map<string,string>::iterator it;
163                         
164                         //check to make sure all parameters are valid for command
165                         for (it = parameters.begin(); it != parameters.end(); it++) { 
166                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
167                         }
168                         
169                         //initialize outputTypes
170                         vector<string> tempOutNames;
171                         outputTypes["fasta"] = tempOutNames;
172                         outputTypes["name"] = tempOutNames;
173                         outputTypes["group"] = tempOutNames;
174                         outputTypes["alignreport"] = tempOutNames;
175                         outputTypes["accnos"] = tempOutNames;
176                         outputTypes["qfile"] = tempOutNames;
177                         outputTypes["taxonomy"] = tempOutNames;
178             outputTypes["count"] = tempOutNames;
179                         outputTypes["contigsreport"] = tempOutNames;
180             outputTypes["summary"] = tempOutNames;
181
182             
183                         //if the user changes the input directory command factory will send this info to us in the output parameter 
184                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
185                         if (inputDir == "not found"){   inputDir = "";          }
186                         else {
187                                 string path;
188                                 it = parameters.find("fasta");
189                                 //user has given a template file
190                                 if(it != parameters.end()){ 
191                                         path = m->hasPath(it->second);
192                                         //if the user has not given a path then, add inputdir. else leave path alone.
193                                         if (path == "") {       parameters["fasta"] = inputDir + it->second;            }
194                                 }
195                                 
196                                 it = parameters.find("group");
197                                 //user has given a template file
198                                 if(it != parameters.end()){ 
199                                         path = m->hasPath(it->second);
200                                         //if the user has not given a path then, add inputdir. else leave path alone.
201                                         if (path == "") {       parameters["group"] = inputDir + it->second;            }
202                                 }
203                                 
204                                 it = parameters.find("name");
205                                 //user has given a template file
206                                 if(it != parameters.end()){ 
207                                         path = m->hasPath(it->second);
208                                         //if the user has not given a path then, add inputdir. else leave path alone.
209                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
210                                 }
211                                 
212                                 it = parameters.find("alignreport");
213                                 //user has given a template file
214                                 if(it != parameters.end()){ 
215                                         path = m->hasPath(it->second);
216                                         //if the user has not given a path then, add inputdir. else leave path alone.
217                                         if (path == "") {       parameters["alignreport"] = inputDir + it->second;              }
218                                 }
219                 
220                 it = parameters.find("contigsreport");
221                                 //user has given a template file
222                                 if(it != parameters.end()){ 
223                                         path = m->hasPath(it->second);
224                                         //if the user has not given a path then, add inputdir. else leave path alone.
225                                         if (path == "") {       parameters["contigsreport"] = inputDir + it->second;            }
226                                 }
227                 
228                 it = parameters.find("summary");
229                                 //user has given a template file
230                                 if(it != parameters.end()){ 
231                                         path = m->hasPath(it->second);
232                                         //if the user has not given a path then, add inputdir. else leave path alone.
233                                         if (path == "") {       parameters["summary"] = inputDir + it->second;          }
234                                 }
235                                 
236                                 it = parameters.find("qfile");
237                                 //user has given a template file
238                                 if(it != parameters.end()){ 
239                                         path = m->hasPath(it->second);
240                                         //if the user has not given a path then, add inputdir. else leave path alone.
241                                         if (path == "") {       parameters["qfile"] = inputDir + it->second;            }
242                                 }
243                                 
244                                 it = parameters.find("taxonomy");
245                                 //user has given a template file
246                                 if(it != parameters.end()){ 
247                                         path = m->hasPath(it->second);
248                                         //if the user has not given a path then, add inputdir. else leave path alone.
249                                         if (path == "") {       parameters["taxonomy"] = inputDir + it->second;         }
250                                 }
251                 
252                 it = parameters.find("count");
253                                 //user has given a template file
254                                 if(it != parameters.end()){ 
255                                         path = m->hasPath(it->second);
256                                         //if the user has not given a path then, add inputdir. else leave path alone.
257                                         if (path == "") {       parameters["count"] = inputDir + it->second;            }
258                                 }
259                         }
260
261                         //check for required parameters
262                         fastafile = validParameter.validFile(parameters, "fasta", true);
263                         if (fastafile == "not found") {                         
264                                 fastafile = m->getFastaFile(); 
265                                 if (fastafile != "") { m->mothurOut("Using " + fastafile + " as input file for the fasta parameter."); m->mothurOutEndLine(); }
266                                 else {  m->mothurOut("You have no current fastafile and the fasta parameter is required."); m->mothurOutEndLine(); abort = true; }
267                         }
268                         else if (fastafile == "not open") { abort = true; }
269                         else { m->setFastaFile(fastafile); }
270         
271                         groupfile = validParameter.validFile(parameters, "group", true);
272                         if (groupfile == "not open") { abort = true; }  
273                         else if (groupfile == "not found") { groupfile = ""; }
274                         else { m->setGroupFile(groupfile); }
275                         
276                         qualfile = validParameter.validFile(parameters, "qfile", true);
277                         if (qualfile == "not open") { abort = true; }   
278                         else if (qualfile == "not found") { qualfile = ""; }
279                         else { m->setQualFile(qualfile); }
280                         
281                         namefile = validParameter.validFile(parameters, "name", true);
282                         if (namefile == "not open") { namefile = ""; abort = true; }
283                         else if (namefile == "not found") { namefile = ""; }    
284                         else { m->setNameFile(namefile); }
285                         
286             countfile = validParameter.validFile(parameters, "count", true);
287                         if (countfile == "not open") { countfile = ""; abort = true; }
288                         else if (countfile == "not found") { countfile = "";  } 
289                         else { m->setCountTableFile(countfile); }
290             
291             contigsreport = validParameter.validFile(parameters, "contigsreport", true);
292                         if (contigsreport == "not open") { contigsreport = ""; abort = true; }
293                         else if (contigsreport == "not found") { contigsreport = "";  } 
294             
295             summaryfile = validParameter.validFile(parameters, "summary", true);
296                         if (summaryfile == "not open") { summaryfile = ""; abort = true; }
297                         else if (summaryfile == "not found") { summaryfile = "";  }
298             else { m->setSummaryFile(summaryfile); }
299             
300             if ((namefile != "") && (countfile != "")) {
301                 m->mothurOut("[ERROR]: you may only use one of the following: name or count."); m->mothurOutEndLine(); abort = true;
302             }
303                         
304             if ((groupfile != "") && (countfile != "")) {
305                 m->mothurOut("[ERROR]: you may only use one of the following: group or count."); m->mothurOutEndLine(); abort=true;
306             }
307             
308                         alignreport = validParameter.validFile(parameters, "alignreport", true);
309                         if (alignreport == "not open") { abort = true; }
310                         else if (alignreport == "not found") { alignreport = ""; }
311                         
312                         taxonomy = validParameter.validFile(parameters, "taxonomy", true);
313                         if (taxonomy == "not open") { abort = true; }
314                         else if (taxonomy == "not found") { taxonomy = ""; }    
315                         
316                         //if the user changes the output directory command factory will send this info to us in the output parameter 
317                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  
318                                 outputDir = ""; 
319                                 outputDir += m->hasPath(fastafile); //if user entered a file with a path then preserve it       
320                         }
321
322                         //check for optional parameter and set defaults
323                         // ...at some point should added some additional type checking...
324                         string temp;
325                         temp = validParameter.validFile(parameters, "start", false);            if (temp == "not found") { temp = "-1"; }
326                         m->mothurConvert(temp, startPos); 
327                 
328                         temp = validParameter.validFile(parameters, "end", false);                      if (temp == "not found") { temp = "-1"; }
329                         m->mothurConvert(temp, endPos);  
330
331                         temp = validParameter.validFile(parameters, "maxambig", false);         if (temp == "not found") { temp = "-1"; }
332                         m->mothurConvert(temp, maxAmbig);  
333
334                         temp = validParameter.validFile(parameters, "maxhomop", false);         if (temp == "not found") { temp = "-1"; }
335                         m->mothurConvert(temp, maxHomoP);  
336
337                         temp = validParameter.validFile(parameters, "minlength", false);        if (temp == "not found") { temp = "-1"; }
338                         m->mothurConvert(temp, minLength); 
339                         
340                         temp = validParameter.validFile(parameters, "maxlength", false);        if (temp == "not found") { temp = "-1"; }
341                         m->mothurConvert(temp, maxLength); 
342                         
343                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found"){       temp = m->getProcessors();      }
344                         m->setProcessors(temp);
345                         m->mothurConvert(temp, processors);
346                         
347             temp = validParameter.validFile(parameters, "minoverlap", false);   if (temp == "not found") { temp = "-1"; }
348                         m->mothurConvert(temp, minOverlap); 
349             
350             temp = validParameter.validFile(parameters, "ostart", false);       if (temp == "not found") { temp = "-1"; }
351                         m->mothurConvert(temp, oStart); 
352             
353             temp = validParameter.validFile(parameters, "oend", false); if (temp == "not found") { temp = "-1"; }
354                         m->mothurConvert(temp, oEnd); 
355             
356             temp = validParameter.validFile(parameters, "mismatches", false);   if (temp == "not found") { temp = "-1"; }
357                         m->mothurConvert(temp, mismatches); 
358             
359             temp = validParameter.validFile(parameters, "maxn", false); if (temp == "not found") { temp = "-1"; }
360                         m->mothurConvert(temp, maxN); 
361             
362             temp = validParameter.validFile(parameters, "minscore", false);     if (temp == "not found") { temp = "-1"; }
363                         m->mothurConvert(temp, minScore); 
364             
365             temp = validParameter.validFile(parameters, "maxinsert", false);    if (temp == "not found") { temp = "-1"; }
366                         m->mothurConvert(temp, maxInsert); 
367             
368             temp = validParameter.validFile(parameters, "minsim", false);       if (temp == "not found") { temp = "-1"; }
369                         m->mothurConvert(temp, minSim); 
370             
371                         temp = validParameter.validFile(parameters, "optimize", false); //optimizing trumps the optimized values original value
372                         if (temp == "not found"){       temp = "none";          }
373                         m->splitAtDash(temp, optimize);         
374             
375             if ((contigsreport != "") && ((summaryfile != "") || ( alignreport != ""))) {
376                 m->mothurOut("[ERROR]: You may only provide one of the following: contigsreport, alignreport or summary, aborting.\n"); abort=true;
377             }
378             
379             if ((alignreport != "") && ((summaryfile != "") || ( contigsreport != ""))) {
380                 m->mothurOut("[ERROR]: You may only provide one of the following: contigsreport, alignreport or summary, aborting.\n"); abort=true;
381             }
382             
383             if ((summaryfile != "") && ((alignreport != "") || ( contigsreport != ""))) {
384                 m->mothurOut("[ERROR]: You may only provide one of the following: contigsreport, alignreport or summary, aborting.\n"); abort=true;
385             }
386                         
387             //check to make sure you have the files you need for certain screening
388             if ((contigsreport == "") && ((minOverlap != -1) || (oStart != -1) || (oEnd != -1) || (mismatches != -1))) {
389                 m->mothurOut("[ERROR]: minoverlap, ostart, oend and mismatches can only be used with a contigs.report file, aborting.\n"); abort=true;
390             }
391             
392             if ((alignreport == "") && ((minScore != -1) || (maxInsert != -1) || (minSim != -1))) {
393                 m->mothurOut("[ERROR]: minscore, maxinsert and minsim can only be used with a align.report file, aborting.\n"); abort=true;
394             }
395             
396                         //check for invalid optimize options
397                         set<string> validOptimizers;
398                         validOptimizers.insert("none"); validOptimizers.insert("start"); validOptimizers.insert("end"); validOptimizers.insert("maxambig"); validOptimizers.insert("maxhomop"); validOptimizers.insert("minlength"); validOptimizers.insert("maxlength"); validOptimizers.insert("maxn");
399             if (contigsreport != "")    { validOptimizers.insert("minoverlap"); validOptimizers.insert("ostart"); validOptimizers.insert("oend"); validOptimizers.insert("mismatches");  }
400             if (alignreport != "")      { validOptimizers.insert("minscore"); validOptimizers.insert("maxinsert"); validOptimizers.insert("minsim"); }
401             
402                         for (int i = 0; i < optimize.size(); i++) { 
403                                 if (validOptimizers.count(optimize[i]) == 0) { 
404                                         m->mothurOut(optimize[i] + " is not a valid optimizer with your input files. Valid options are "); 
405                     string valid = "";
406                     for (set<string>::iterator it = validOptimizers.begin(); it != validOptimizers.end(); it++) {
407                         valid += (*it) + ", ";
408                     }
409                     if (valid.length() != 0) {  valid = valid.substr(0, valid.length()-2); }
410                     m->mothurOut(valid + ".");
411                     m->mothurOutEndLine();
412                                         optimize.erase(optimize.begin()+i);
413                                         i--;
414                                 }
415                         }
416                         
417                         if (optimize.size() == 1) { if (optimize[0] == "none") { optimize.clear(); } }
418                         
419                         temp = validParameter.validFile(parameters, "criteria", false); if (temp == "not found"){       temp = "90";                            }
420                         m->mothurConvert(temp, criteria); 
421                         
422                         if (countfile == "") { 
423                 if (namefile == "") {
424                     vector<string> files; files.push_back(fastafile);
425                     parser.getNameFile(files);
426                 }
427             }
428                 }
429
430         }
431         catch(exception& e) {
432                 m->errorOut(e, "ScreenSeqsCommand", "ScreenSeqsCommand");
433                 exit(1);
434         }
435 }
436 //***************************************************************************************************************
437
438 int ScreenSeqsCommand::execute(){
439         try{
440                 
441                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
442                 
443         map<string, string> badSeqNames;
444         int start = time(NULL);
445         int numFastaSeqs = 0;
446         
447         if ((contigsreport == "") && (summaryfile == "") && (alignreport == "")) {   numFastaSeqs = screenFasta(badSeqNames);  }
448         else {   numFastaSeqs = screenReports(badSeqNames);   }
449                 
450         if (m->control_pressed) {  for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
451         
452         #ifdef USE_MPI
453             int pid;
454             MPI_Comm_rank(MPI_COMM_WORLD, &pid); 
455         
456             if (pid == 0) { //only one process should fix files
457         #endif  
458                 
459                 if(namefile != "" && groupfile != "")   {       
460                         screenNameGroupFile(badSeqNames);       
461                         if (m->control_pressed) {  for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]);  } return 0; }
462                 }else if(namefile != "")        {       
463                         screenNameGroupFile(badSeqNames);
464                         if (m->control_pressed) {  for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]);  } return 0; }       
465                 }else if(groupfile != "")                               {       screenGroupFile(badSeqNames);           }       // this screens just the group
466                 else if (countfile != "") {     screenCountFile(badSeqNames);           }
467             
468                 
469                 if (m->control_pressed) {  for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]);  } return 0; }
470
471                 if(qualfile != "")                                              {       screenQual(badSeqNames);                        }
472                 if(taxonomy != "")                                              {       screenTaxonomy(badSeqNames);            }
473                 
474                 if (m->control_pressed) {  for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]);  } return 0; }
475                 
476                 #ifdef USE_MPI
477                         }
478                 #endif
479
480                 m->mothurOutEndLine();
481                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
482                 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
483                 m->mothurOutEndLine();
484                 m->mothurOutEndLine();
485                 
486                 //set fasta file as new current fastafile
487                 string current = "";
488                 itTypes = outputTypes.find("fasta");
489                 if (itTypes != outputTypes.end()) {
490                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFastaFile(current); }
491                 }
492                 
493                 itTypes = outputTypes.find("name");
494                 if (itTypes != outputTypes.end()) {
495                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setNameFile(current); }
496                 }
497                 
498                 itTypes = outputTypes.find("group");
499                 if (itTypes != outputTypes.end()) {
500                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setGroupFile(current); }
501                 }
502                 
503                 itTypes = outputTypes.find("qfile");
504                 if (itTypes != outputTypes.end()) {
505                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setQualFile(current); }
506                 }
507                 
508                 itTypes = outputTypes.find("taxonomy");
509                 if (itTypes != outputTypes.end()) {
510                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setTaxonomyFile(current); }
511                 }
512         
513         itTypes = outputTypes.find("count");
514                 if (itTypes != outputTypes.end()) {
515                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setCountTableFile(current); }
516                 }
517
518                 m->mothurOut("It took " + toString(time(NULL) - start) + " secs to screen " + toString(numFastaSeqs) + " sequences.");
519                 m->mothurOutEndLine();
520
521                 return 0;
522         }
523         catch(exception& e) {
524                 m->errorOut(e, "ScreenSeqsCommand", "execute");
525                 exit(1);
526         }
527 }
528 //***************************************************************************************************************/
529 int ScreenSeqsCommand::runFastaScreening(map<string, string>& badSeqNames){
530         try{
531         int numFastaSeqs = 0;
532         map<string, string> variables; 
533         variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(fastafile));
534         string badAccnosFile =  getOutputFileName("accnos",variables);
535         variables["[extension]"] = m->getExtension(fastafile);
536                 string goodSeqFile = getOutputFileName("fasta", variables);
537                 outputNames.push_back(goodSeqFile); outputTypes["fasta"].push_back(goodSeqFile);
538                 outputNames.push_back(badAccnosFile); outputTypes["accnos"].push_back(badAccnosFile);
539         
540 #ifdef USE_MPI  
541         int pid, numSeqsPerProcessor; 
542         int tag = 2001;
543         vector<unsigned long long> MPIPos;
544         
545         MPI_Status status; 
546         MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
547         MPI_Comm_size(MPI_COMM_WORLD, &processors); 
548         
549         MPI_File inMPI;
550         MPI_File outMPIGood;
551         MPI_File outMPIBadAccnos;
552         
553         int outMode=MPI_MODE_CREATE|MPI_MODE_WRONLY; 
554         int inMode=MPI_MODE_RDONLY; 
555         
556         char outGoodFilename[1024];
557         strcpy(outGoodFilename, goodSeqFile.c_str());
558         
559         char outBadAccnosFilename[1024];
560         strcpy(outBadAccnosFilename, badAccnosFile.c_str());
561         
562         char inFileName[1024];
563         strcpy(inFileName, fastafile.c_str());
564         
565         MPI_File_open(MPI_COMM_WORLD, inFileName, inMode, MPI_INFO_NULL, &inMPI);  //comm, filename, mode, info, filepointer
566         MPI_File_open(MPI_COMM_WORLD, outGoodFilename, outMode, MPI_INFO_NULL, &outMPIGood);
567         MPI_File_open(MPI_COMM_WORLD, outBadAccnosFilename, outMode, MPI_INFO_NULL, &outMPIBadAccnos);
568         
569         if (m->control_pressed) { MPI_File_close(&inMPI);  MPI_File_close(&outMPIGood); MPI_File_close(&outMPIBadAccnos); return 0; }
570         
571         if (pid == 0) { //you are the root process 
572             
573             MPIPos = m->setFilePosFasta(fastafile, numFastaSeqs); //fills MPIPos, returns numSeqs
574             
575             //send file positions to all processes
576             for(int i = 1; i < processors; i++) { 
577                 MPI_Send(&numFastaSeqs, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
578                 MPI_Send(&MPIPos[0], (numFastaSeqs+1), MPI_LONG, i, tag, MPI_COMM_WORLD);
579             }
580             
581             //figure out how many sequences you have to align
582             numSeqsPerProcessor = numFastaSeqs / processors;
583             int startIndex =  pid * numSeqsPerProcessor;
584             if(pid == (processors - 1)){        numSeqsPerProcessor = numFastaSeqs - pid * numSeqsPerProcessor;         }
585             
586             //align your part
587             driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPIGood, outMPIBadAccnos, MPIPos, badSeqNames);
588             
589             if (m->control_pressed) { MPI_File_close(&inMPI);  MPI_File_close(&outMPIGood);  MPI_File_close(&outMPIBadAccnos);  return 0; }
590             
591             for (int i = 1; i < processors; i++) {
592                 //get bad lists
593                 int badSize;
594                 MPI_Recv(&badSize, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
595             }
596         }else{ //you are a child process
597             MPI_Recv(&numFastaSeqs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
598             MPIPos.resize(numFastaSeqs+1);
599             MPI_Recv(&MPIPos[0], (numFastaSeqs+1), MPI_LONG, 0, tag, MPI_COMM_WORLD, &status);
600             
601             //figure out how many sequences you have to align
602             numSeqsPerProcessor = numFastaSeqs / processors;
603             int startIndex =  pid * numSeqsPerProcessor;
604             if(pid == (processors - 1)){        numSeqsPerProcessor = numFastaSeqs - pid * numSeqsPerProcessor;         }
605             
606             //align your part
607             driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPIGood, outMPIBadAccnos, MPIPos, badSeqNames);
608             
609             if (m->control_pressed) { MPI_File_close(&inMPI);  MPI_File_close(&outMPIGood);  MPI_File_close(&outMPIBadAccnos); return 0; }
610             
611             //send bad list     
612             int badSize = badSeqNames.size();
613             MPI_Send(&badSize, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
614         }
615         
616         //close files 
617         MPI_File_close(&inMPI);
618         MPI_File_close(&outMPIGood);
619         MPI_File_close(&outMPIBadAccnos);
620         MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case
621         
622 #else
623         if(processors == 1){ numFastaSeqs = driver(lines[0], goodSeqFile, badAccnosFile, fastafile, badSeqNames);       }       
624         else{ numFastaSeqs = createProcesses(goodSeqFile, badAccnosFile, fastafile, badSeqNames); }
625         
626         if (m->control_pressed) { m->mothurRemove(goodSeqFile); return numFastaSeqs; }
627 #endif          
628         
629 #ifdef USE_MPI
630         MPI_Comm_rank(MPI_COMM_WORLD, &pid); 
631         
632         if (pid == 0) { //only one process should fix files
633                         
634             //read accnos file with all names in it, process 0 just has its names
635             MPI_File inMPIAccnos;
636             MPI_Offset size;
637                         
638             char inFileName[1024];
639             strcpy(inFileName, badAccnosFile.c_str());
640                         
641             MPI_File_open(MPI_COMM_SELF, inFileName, inMode, MPI_INFO_NULL, &inMPIAccnos);  //comm, filename, mode, info, filepointer
642             MPI_File_get_size(inMPIAccnos, &size);
643                         
644             char* buffer = new char[size];
645             MPI_File_read(inMPIAccnos, buffer, size, MPI_CHAR, &status);
646                         
647             string tempBuf = buffer;
648             if (tempBuf.length() > size) { tempBuf = tempBuf.substr(0, size);  }
649             istringstream iss (tempBuf,istringstream::in);
650             
651             delete buffer;
652             MPI_File_close(&inMPIAccnos);
653             
654             badSeqNames.clear();
655             string tempName, trashCode;
656             while (!iss.eof()) {
657                 iss >> tempName >> trashCode; m->gobble(iss);
658                 badSeqNames[tempName] = trashCode;
659             }
660         }
661 #endif
662         
663         
664                 return numFastaSeqs;
665
666         }
667         catch(exception& e) {
668                 m->errorOut(e, "ScreenSeqsCommand", "runFastaScreening");
669                 exit(1);
670         }
671 }
672 //***************************************************************************************************************/
673 int ScreenSeqsCommand::screenReports(map<string, string>& badSeqNames){
674         try{
675         int numFastaSeqs = 0;
676         bool summarizedFasta = false;
677         
678         //did not provide a summary file, but set a parameter that requires summarizing the fasta file
679         //or did provide a summary file, but set maxn parameter so we must summarize the fasta file 
680         vector<unsigned long long> positions;
681         if (((summaryfile == "") && ((m->inUsersGroups("maxambig", optimize)) ||(m->inUsersGroups("maxhomop", optimize)) ||(m->inUsersGroups("maxlength", optimize)) || (m->inUsersGroups("minlength", optimize)) || (m->inUsersGroups("start", optimize)) || (m->inUsersGroups("end", optimize)))) || ((summaryfile != "") && m->inUsersGroups("maxn", optimize))) {  
682             //use the namefile to optimize correctly
683             if (namefile != "") { nameMap = m->readNames(namefile); }
684             else if (countfile != "") {
685                 CountTable ct;
686                 ct.readTable(countfile, true);
687                 nameMap = ct.getNameMap();
688             }
689             getSummary(positions); 
690             summarizedFasta = true;
691         } else {
692             #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
693                 positions = m->divideFile(fastafile, processors);
694                 for (int i = 0; i < (positions.size()-1); i++) { lines.push_back(linePair(positions[i], positions[(i+1)])); }
695             #else 
696                 if(processors == 1){ lines.push_back(linePair(0, 1000));  }
697                 else {
698                     int numFastaSeqs = 0;
699                     positions = m->setFilePosFasta(fastafile, numFastaSeqs); 
700                     if (positions.size() < processors) { processors = positions.size(); }
701                 
702                     //figure out how many sequences you have to process
703                     int numSeqsPerProcessor = numFastaSeqs / processors;
704                     for (int i = 0; i < processors; i++) {
705                         int startIndex =  i * numSeqsPerProcessor;
706                         if(i == (processors - 1)){      numSeqsPerProcessor = numFastaSeqs - i * numSeqsPerProcessor;   }
707                         lines.push_back(linePair(positions[startIndex], numSeqsPerProcessor));
708                     }
709                 }
710             #endif
711         }
712         
713         if ((summaryfile != "") && ((m->inUsersGroups("maxambig", optimize)) ||(m->inUsersGroups("maxhomop", optimize)) ||(m->inUsersGroups("maxlength", optimize)) || (m->inUsersGroups("minlength", optimize)) || (m->inUsersGroups("start", optimize)) || (m->inUsersGroups("end", optimize))) && !summarizedFasta) { //summarize based on summaryfile
714             if (namefile != "") { nameMap = m->readNames(namefile); }
715             else if (countfile != "") {
716                 CountTable ct;
717                 ct.readTable(countfile, true);
718                 nameMap = ct.getNameMap();
719             }
720             getSummaryReport();
721         }else if ((contigsreport != "") && ((m->inUsersGroups("minoverlap", optimize)) || (m->inUsersGroups("ostart", optimize)) || (m->inUsersGroups("oend", optimize)) || (m->inUsersGroups("mismatches", optimize)))) { //optimize settings based on contigs file
722             optimizeContigs();
723         }else if ((alignreport != "") && ((m->inUsersGroups("minsim", optimize)) || (m->inUsersGroups("minscore", optimize)) || (m->inUsersGroups("maxinsert", optimize)))) { //optimize settings based on contigs file
724             optimizeAlign();
725         }
726         
727         
728         //provided summary file, and did not set maxn so no need to summarize fasta
729         if (summaryfile != "")      {   numFastaSeqs = screenSummary(badSeqNames);  }
730         //add in any seqs that fail due to contigs report results
731         else if (contigsreport != "")    {   numFastaSeqs = screenContigs(badSeqNames);  }
732         //add in any seqs that fail due to align report
733         else if (alignreport != "")      {   numFastaSeqs = screenAlignReport(badSeqNames);  }
734         
735         return numFastaSeqs;
736     }
737         catch(exception& e) {
738                 m->errorOut(e, "ScreenSeqsCommand", "screenReports");
739                 exit(1);
740         }
741 }
742 //***************************************************************************************************************
743 int ScreenSeqsCommand::screenAlignReport(map<string, string>& badSeqNames){
744         try {
745         
746         map<string, string> variables; 
747         variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(alignreport));
748         string outSummary =  getOutputFileName("alignreport",variables);
749                 outputNames.push_back(outSummary); outputTypes["alignreport"].push_back(outSummary);
750         
751         string name, TemplateName, SearchMethod, AlignmentMethod;
752         //QueryName     QueryLength     TemplateName    TemplateLength  SearchMethod    SearchScore     AlignmentMethod QueryStart      QueryEnd        TemplateStart   TemplateEnd     PairwiseAlignmentLength GapsInQuery     GapsInTemplate  LongestInsert   SimBtwnQuery&Template
753         //checking for minScore, maxInsert, minSim
754         int length, TemplateLength,      QueryStart,    QueryEnd,       TemplateStart,  TemplateEnd,    PairwiseAlignmentLength,        GapsInQuery,    GapsInTemplate, LongestInsert;
755         float SearchScore, SimBtwnQueryTemplate;
756         
757         ofstream out;
758         m->openOutputFile(outSummary, out);
759         
760         //read summary file
761         ifstream in;
762         m->openInputFile(alignreport, in);
763         out << (m->getline(in)) << endl;   //skip headers
764         
765                 int count = 0;
766         
767                 while (!in.eof()) {
768             
769             if (m->control_pressed) { in.close(); out.close(); return 0; }
770             
771             //seqname   start   end     nbases  ambigs  polymer numSeqs
772             in >> name >> length >> TemplateName >> TemplateLength >> SearchMethod >> SearchScore >> AlignmentMethod >> QueryStart >> QueryEnd >> TemplateStart >> TemplateEnd >> PairwiseAlignmentLength >> GapsInQuery >> GapsInTemplate >> LongestInsert >> SimBtwnQueryTemplate; m->gobble(in);
773
774             bool goodSeq = 1;           //      innocent until proven guilty
775             string trashCode = "";
776             if(maxInsert != -1 && maxInsert < LongestInsert)    {       goodSeq = 0; trashCode += "insert|";    }
777             if(minScore != -1 && minScore > SearchScore)                {       goodSeq = 0; trashCode += "score|";     }
778             if(minSim != -1 && minSim > SimBtwnQueryTemplate)   {       goodSeq = 0; trashCode += "sim|";       }
779             
780             if(goodSeq == 1){
781                 out << name << '\t' << length << '\t' << TemplateName  << '\t' << TemplateLength  << '\t' << SearchMethod  << '\t' << SearchScore  << '\t' << AlignmentMethod  << '\t' << QueryStart  << '\t' << QueryEnd  << '\t' << TemplateStart  << '\t' << TemplateEnd  << '\t' << PairwiseAlignmentLength  << '\t' << GapsInQuery  << '\t' << GapsInTemplate  << '\t' << LongestInsert  << '\t' << SimBtwnQueryTemplate << endl;
782             }
783             else{ badSeqNames[name] = trashCode;  }
784             count++;
785         }
786         in.close();
787         out.close();
788         
789         int oldBadSeqsCount = badSeqNames.size();
790         
791         int numFastaSeqs = runFastaScreening(badSeqNames);
792         
793         if (oldBadSeqsCount != badSeqNames.size()) { //more seqs were removed by maxns
794             m->renameFile(outSummary, outSummary+".temp");
795             
796             ofstream out2;
797             m->openOutputFile(outSummary, out2);
798             
799             //read summary file
800             ifstream in2;
801             m->openInputFile(outSummary+".temp", in2);
802             out2 << (m->getline(in2)) << endl;   //skip headers
803             
804             while (!in2.eof()) {
805                 
806                 if (m->control_pressed) { in2.close(); out2.close(); return 0; }
807                 
808                 //seqname       start   end     nbases  ambigs  polymer numSeqs
809                 in2 >> name >> length >> TemplateName >> TemplateLength >> SearchMethod >> SearchScore >> AlignmentMethod >> QueryStart >> QueryEnd >> TemplateStart >> TemplateEnd >> PairwiseAlignmentLength >> GapsInQuery >> GapsInTemplate >> LongestInsert >> SimBtwnQueryTemplate; m->gobble(in2);
810                 
811                 if (badSeqNames.count(name) == 0) { //are you good?
812                     out2 << name << '\t' << length << '\t' << TemplateName  << '\t' << TemplateLength  << '\t' << SearchMethod  << '\t' << SearchScore  << '\t' << AlignmentMethod  << '\t' << QueryStart  << '\t' << QueryEnd  << '\t' << TemplateStart  << '\t' << TemplateEnd  << '\t' << PairwiseAlignmentLength  << '\t' << GapsInQuery  << '\t' << GapsInTemplate  << '\t' << LongestInsert  << '\t' << SimBtwnQueryTemplate << endl;             
813                 }
814             }
815             in2.close();
816             out2.close();
817             m->mothurRemove(outSummary+".temp");
818         }
819         
820         if (numFastaSeqs != count) {  m->mothurOut("[ERROR]: found " + toString(numFastaSeqs) + " sequences in your fasta file, and " + toString(count) + " sequences in your contigs report file, quitting.\n"); m->control_pressed = true; }
821         
822         
823         return count;
824         
825                 return 0;
826         
827         }
828         catch(exception& e) {
829                 m->errorOut(e, "ScreenSeqsCommand", "screenAlignReport");
830                 exit(1);
831         }
832         
833 }
834 //***************************************************************************************************************/
835 int ScreenSeqsCommand::screenContigs(map<string, string>& badSeqNames){
836         try{
837         map<string, string> variables; 
838         variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(contigsreport));
839         string outSummary =  getOutputFileName("contigsreport",variables);
840                 outputNames.push_back(outSummary); outputTypes["contigsreport"].push_back(outSummary);
841         
842         string name;
843         //Name  Length  Overlap_Length  Overlap_Start   Overlap_End     MisMatches      Num_Ns
844         int length, OLength, thisOStart, thisOEnd, numMisMatches, numNs;
845         
846         ofstream out;
847         m->openOutputFile(outSummary, out);
848         
849         //read summary file
850         ifstream in;
851         m->openInputFile(contigsreport, in);
852         out << (m->getline(in)) << endl;   //skip headers
853         
854                 int count = 0;
855         
856                 while (!in.eof()) {
857             
858             if (m->control_pressed) { in.close(); out.close(); return 0; }
859             
860             //seqname   start   end     nbases  ambigs  polymer numSeqs
861             in >> name >> length >> OLength >> thisOStart >> thisOEnd >> numMisMatches >> numNs; m->gobble(in);
862             
863             bool goodSeq = 1;           //      innocent until proven guilty
864             string trashCode = "";
865             if(oStart != -1 && oStart < thisOStart)             {       goodSeq = 0;    trashCode += "ostart|";     }
866             if(oEnd != -1 && oEnd > thisOEnd)                   {       goodSeq = 0;    trashCode += "oend|";       }
867             if(maxN != -1 && maxN <     numNs)                      {   goodSeq = 0;    trashCode += "n|";          }
868             if(minOverlap != -1 && minOverlap > OLength)                {       goodSeq = 0;    trashCode += "olength|";    }
869             if(mismatches != -1 && mismatches < numMisMatches)  {       goodSeq = 0;    trashCode += "mismatches|"; }
870             
871             if(goodSeq == 1){
872                 out << name << '\t' << length  << '\t' << OLength  << '\t' << thisOStart  << '\t' << thisOEnd  << '\t' << numMisMatches  << '\t' << numNs << endl;      
873             }
874             else{ badSeqNames[name] = trashCode; }
875             count++;
876         }
877         in.close();
878         out.close();
879         
880         int oldBadSeqsCount = badSeqNames.size();
881         
882         int numFastaSeqs = runFastaScreening(badSeqNames);
883         
884         if (oldBadSeqsCount != badSeqNames.size()) { //more seqs were removed by maxns
885             m->renameFile(outSummary, outSummary+".temp");
886             
887             ofstream out2;
888             m->openOutputFile(outSummary, out2);
889             
890             //read summary file
891             ifstream in2;
892             m->openInputFile(outSummary+".temp", in2);
893             out2 << (m->getline(in2)) << endl;   //skip headers
894             
895             while (!in2.eof()) {
896                 
897                 if (m->control_pressed) { in2.close(); out2.close(); return 0; }
898                 
899                 //seqname       start   end     nbases  ambigs  polymer numSeqs
900                 in2 >> name >> length >> OLength >> thisOStart >> thisOEnd >> numMisMatches >> numNs; m->gobble(in2);
901                 
902                 if (badSeqNames.count(name) == 0) { //are you good?
903                     out2 << name << '\t' << length  << '\t' << OLength  << '\t' << thisOStart  << '\t' << thisOEnd  << '\t' << numMisMatches  << '\t' << numNs << endl;         
904                 }
905             }
906             in2.close();
907             out2.close();
908             m->mothurRemove(outSummary+".temp");
909         }
910         
911         if (numFastaSeqs != count) {  m->mothurOut("[ERROR]: found " + toString(numFastaSeqs) + " sequences in your fasta file, and " + toString(count) + " sequences in your contigs report file, quitting.\n"); m->control_pressed = true; }
912         
913         
914         return count;
915         
916     }
917         catch(exception& e) {
918                 m->errorOut(e, "ScreenSeqsCommand", "screenContigs");
919                 exit(1);
920         }
921 }
922 //***************************************************************************************************************/
923 int ScreenSeqsCommand::screenSummary(map<string, string>& badSeqNames){
924         try{
925         map<string, string> variables; 
926         variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(summaryfile));
927         string outSummary =  getOutputFileName("summary",variables);
928                 outputNames.push_back(outSummary); outputTypes["summary"].push_back(outSummary);
929         
930         string name;
931         int start, end, length, ambigs, polymer, numReps;
932         
933         ofstream out;
934         m->openOutputFile(outSummary, out);
935                 
936         //read summary file
937         ifstream in;
938         m->openInputFile(summaryfile, in);
939         out << (m->getline(in)) << endl;   //skip headers
940          
941                 int count = 0;
942         
943                 while (!in.eof()) {
944             
945             if (m->control_pressed) { in.close(); out.close(); return 0; }
946             
947             //seqname   start   end     nbases  ambigs  polymer numSeqs
948             in >> name >> start >> end >> length >> ambigs >> polymer >> numReps; m->gobble(in);
949             
950             bool goodSeq = 1;           //      innocent until proven guilty
951             string trashCode = "";
952             if(startPos != -1 && startPos < start)                      {       goodSeq = 0;    trashCode += "start|"; }
953             if(endPos != -1 && endPos > end)                            {       goodSeq = 0;    trashCode += "end|"; }
954             if(maxAmbig != -1 && maxAmbig <     ambigs)         {       goodSeq = 0;    trashCode += "ambig|"; }
955             if(maxHomoP != -1 && maxHomoP < polymer)        {   goodSeq = 0;    trashCode += "homop|"; }
956             if(minLength != -1 && minLength > length)           {       goodSeq = 0;    trashCode += "<length|"; }
957             if(maxLength != -1 && maxLength < length)           {       goodSeq = 0;    trashCode += ">length|"; }
958             
959             if(goodSeq == 1){
960                 out << name << '\t' << start  << '\t' << end  << '\t' << length  << '\t' << ambigs  << '\t' << polymer  << '\t' << numReps << endl;     
961             }
962             else{ badSeqNames[name] = trashCode; }
963             count++;
964         }
965         in.close();
966         out.close();
967         
968         int oldBadSeqsCount = badSeqNames.size();
969         
970         int numFastaSeqs = runFastaScreening(badSeqNames);
971         
972         if (oldBadSeqsCount != badSeqNames.size()) { //more seqs were removed by maxns
973             m->renameFile(outSummary, outSummary+".temp");
974             
975             ofstream out2;
976             m->openOutputFile(outSummary, out2);
977             
978             //read summary file
979             ifstream in2;
980             m->openInputFile(outSummary+".temp", in2);
981             out2 << (m->getline(in2)) << endl;   //skip headers
982             
983             while (!in2.eof()) {
984                 
985                 if (m->control_pressed) { in2.close(); out2.close(); return 0; }
986                 
987                 //seqname       start   end     nbases  ambigs  polymer numSeqs
988                 in2 >> name >> start >> end >> length >> ambigs >> polymer >> numReps; m->gobble(in2);
989                 
990                 if (badSeqNames.count(name) == 0) { //are you good?
991                     out2 << name << '\t' << start  << '\t' << end  << '\t' << length  << '\t' << ambigs  << '\t' << polymer  << '\t' << numReps << endl;        
992                 }
993             }
994             in2.close();
995             out2.close();
996             m->mothurRemove(outSummary+".temp");
997         }
998         
999         if (numFastaSeqs != count) {  m->mothurOut("[ERROR]: found " + toString(numFastaSeqs) + " sequences in your fasta file, and " + toString(count) + " sequences in your summary file, quitting.\n"); m->control_pressed = true; }
1000         
1001         
1002         
1003         return count;
1004     }
1005         catch(exception& e) {
1006                 m->errorOut(e, "ScreenSeqsCommand", "screenSummary");
1007                 exit(1);
1008         }
1009 }
1010 //***************************************************************************************************************/
1011 int ScreenSeqsCommand::screenFasta(map<string, string>& badSeqNames){
1012         try{
1013         
1014         
1015         //if the user want to optimize we need to know the 90% mark
1016                 vector<unsigned long long> positions;
1017                 if (optimize.size() != 0) {  //get summary is paralellized so we need to divideFile, no need to do this step twice so I moved it here
1018                         //use the namefile to optimize correctly
1019                         if (namefile != "") { nameMap = m->readNames(namefile); }
1020             else if (countfile != "") {
1021                 CountTable ct;
1022                 ct.readTable(countfile, true);
1023                 nameMap = ct.getNameMap();
1024             }
1025                         getSummary(positions); 
1026                 }else { 
1027 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
1028             positions = m->divideFile(fastafile, processors);
1029             for (int i = 0; i < (positions.size()-1); i++) { lines.push_back(linePair(positions[i], positions[(i+1)])); }
1030 #else 
1031             if(processors == 1){ lines.push_back(linePair(0, 1000));  }
1032             else {
1033                 int numFastaSeqs = 0;
1034                 positions = m->setFilePosFasta(fastafile, numFastaSeqs); 
1035                 if (positions.size() < processors) { processors = positions.size(); }
1036                 
1037                 //figure out how many sequences you have to process
1038                 int numSeqsPerProcessor = numFastaSeqs / processors;
1039                 for (int i = 0; i < processors; i++) {
1040                     int startIndex =  i * numSeqsPerProcessor;
1041                     if(i == (processors - 1)){  numSeqsPerProcessor = numFastaSeqs - i * numSeqsPerProcessor;   }
1042                     lines.push_back(linePair(positions[startIndex], numSeqsPerProcessor));
1043                 }
1044             }
1045 #endif
1046                 }
1047         
1048         if (m->control_pressed) { return 0; }
1049         
1050         int numFastaSeqs = runFastaScreening(badSeqNames);
1051         
1052         return numFastaSeqs;
1053         
1054     }
1055         catch(exception& e) {
1056                 m->errorOut(e, "ScreenSeqsCommand", "screenFasta");
1057                 exit(1);
1058         }
1059 }       
1060 //***************************************************************************************************************
1061
1062 int ScreenSeqsCommand::screenNameGroupFile(map<string, string> badSeqNames){
1063         try {
1064                 ifstream inputNames;
1065                 m->openInputFile(namefile, inputNames);
1066                 map<string, string> badSeqGroups;
1067                 string seqName, seqList, group;
1068                 map<string, string>::iterator it;
1069         map<string, string> variables; 
1070                 variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(namefile));
1071         variables["[extension]"] = m->getExtension(namefile);
1072                 string goodNameFile = getOutputFileName("name", variables);
1073                 outputNames.push_back(goodNameFile);  outputTypes["name"].push_back(goodNameFile);
1074                 
1075                 ofstream goodNameOut;   m->openOutputFile(goodNameFile, goodNameOut);
1076                 
1077                 while(!inputNames.eof()){
1078                         if (m->control_pressed) { goodNameOut.close();  inputNames.close(); m->mothurRemove(goodNameFile);  return 0; }
1079
1080                         inputNames >> seqName; m->gobble(inputNames); inputNames >> seqList;
1081                         it = badSeqNames.find(seqName);
1082                                 
1083                         if(it != badSeqNames.end()){
1084                                 if(namefile != ""){
1085                                         int start = 0;
1086                                         for(int i=0;i<seqList.length();i++){
1087                                                 if(seqList[i] == ','){
1088                                                         badSeqGroups[seqList.substr(start,i-start)] = it->second;
1089                                                         start = i+1;
1090                                                 }                                       
1091                                         }
1092                                         badSeqGroups[seqList.substr(start,seqList.length()-start)] = it->second;
1093                                 }
1094                 badSeqNames.erase(it);
1095                         }
1096                         else{
1097                                 goodNameOut << seqName << '\t' << seqList << endl;
1098                         }
1099                         m->gobble(inputNames);
1100                 }
1101                 inputNames.close();
1102                 goodNameOut.close();
1103         
1104                 //we were unable to remove some of the bad sequences
1105                 if (badSeqNames.size() != 0) {
1106                         for (it = badSeqNames.begin(); it != badSeqNames.end(); it++) {  
1107                                 m->mothurOut("Your namefile does not include the sequence " + it->first + " please correct."); 
1108                                 m->mothurOutEndLine();
1109                         }
1110                 }
1111
1112                 if(groupfile != ""){
1113                         
1114                         ifstream inputGroups;
1115                         m->openInputFile(groupfile, inputGroups);
1116             variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(groupfile));
1117             variables["[extension]"] = m->getExtension(groupfile);
1118             string goodGroupFile = getOutputFileName("group", variables);
1119                         
1120                         outputNames.push_back(goodGroupFile);   outputTypes["group"].push_back(goodGroupFile);
1121                         
1122                         ofstream goodGroupOut;  m->openOutputFile(goodGroupFile, goodGroupOut);
1123                         
1124                         while(!inputGroups.eof()){
1125                                 if (m->control_pressed) { goodGroupOut.close(); inputGroups.close(); m->mothurRemove(goodNameFile);  m->mothurRemove(goodGroupFile); return 0; }
1126
1127                                 inputGroups >> seqName; m->gobble(inputGroups); inputGroups >> group;
1128                                 
1129                                 it = badSeqGroups.find(seqName);
1130                                 
1131                                 if(it != badSeqGroups.end()){
1132                                         badSeqGroups.erase(it);
1133                                 }
1134                                 else{
1135                                         goodGroupOut << seqName << '\t' << group << endl;
1136                                 }
1137                                 m->gobble(inputGroups);
1138                         }
1139                         inputGroups.close();
1140                         goodGroupOut.close();
1141                         
1142                         //we were unable to remove some of the bad sequences
1143                         if (badSeqGroups.size() != 0) {
1144                                 for (it = badSeqGroups.begin(); it != badSeqGroups.end(); it++) {  
1145                                         m->mothurOut("Your groupfile does not include the sequence " + it->first + " please correct."); 
1146                                         m->mothurOutEndLine();
1147                                 }
1148                         }
1149                 }
1150                 
1151                 
1152                 return 0;
1153         
1154         }
1155         catch(exception& e) {
1156                 m->errorOut(e, "ScreenSeqsCommand", "screenNameGroupFile");
1157                 exit(1);
1158         }
1159 }
1160 //***************************************************************************************************************
1161 int ScreenSeqsCommand::getSummaryReport(){
1162         try {
1163                 
1164                 vector<int> startPosition;
1165                 vector<int> endPosition;
1166                 vector<int> seqLength;
1167                 vector<int> ambigBases;
1168                 vector<int> longHomoPolymer;
1169         
1170 #ifdef USE_MPI
1171                 int pid;
1172                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); 
1173                 
1174                 if (pid == 0) { 
1175 #endif
1176             
1177             
1178             //read summary file
1179             ifstream in;
1180             m->openInputFile(summaryfile, in);
1181             m->getline(in);
1182             
1183             string name;
1184             int start, end, length, ambigs, polymer, numReps;
1185             
1186             while (!in.eof()) {
1187                 
1188                 if (m->control_pressed) { in.close(); return 0; }
1189                 
1190                 //seqname       start   end     nbases  ambigs  polymer numSeqs
1191                 in >> name >> start >> end >> length >> ambigs >> polymer >> numReps; m->gobble(in);
1192                 
1193                 int num = 1;
1194                                 if ((namefile != "") || (countfile !="")) {
1195                                         //make sure this sequence is in the namefile, else error 
1196                                         map<string, int>::iterator it = nameMap.find(name);
1197                                         
1198                                         if (it == nameMap.end()) { m->mothurOut("[ERROR]: " + name + " is not in your namefile, please correct."); m->mothurOutEndLine(); m->control_pressed = true; }
1199                                         else { num = it->second; }
1200                                 }
1201                                 
1202                                 //for each sequence this sequence represents
1203                                 for (int i = 0; i < num; i++) {
1204                                         startPosition.push_back(start);
1205                                         endPosition.push_back(end);
1206                                         seqLength.push_back(length);
1207                                         ambigBases.push_back(ambigs);
1208                                         longHomoPolymer.push_back(polymer);
1209                                 }
1210                
1211             }
1212             in.close();
1213
1214         sort(startPosition.begin(), startPosition.end());
1215                 sort(endPosition.begin(), endPosition.end());
1216                 sort(seqLength.begin(), seqLength.end());
1217                 sort(ambigBases.begin(), ambigBases.end());
1218                 sort(longHomoPolymer.begin(), longHomoPolymer.end());
1219                 
1220                 //numSeqs is the number of unique seqs, startPosition.size() is the total number of seqs, we want to optimize using all seqs
1221                 int criteriaPercentile  = int(startPosition.size() * (criteria / (float) 100));
1222                 
1223                 for (int i = 0; i < optimize.size(); i++) {
1224                         if (optimize[i] == "start") { startPos = startPosition[criteriaPercentile]; m->mothurOut("Optimizing start to " + toString(startPos) + "."); m->mothurOutEndLine(); }
1225                         else if (optimize[i] == "end") { int endcriteriaPercentile = int(endPosition.size() * ((100 - criteria) / (float) 100));  endPos = endPosition[endcriteriaPercentile]; m->mothurOut("Optimizing end to " + toString(endPos) + "."); m->mothurOutEndLine();}
1226                         else if (optimize[i] == "maxambig") { maxAmbig = ambigBases[criteriaPercentile]; m->mothurOut("Optimizing maxambig to " + toString(maxAmbig) + "."); m->mothurOutEndLine(); }
1227                         else if (optimize[i] == "maxhomop") { maxHomoP = longHomoPolymer[criteriaPercentile]; m->mothurOut("Optimizing maxhomop to " + toString(maxHomoP) + "."); m->mothurOutEndLine(); }
1228                         else if (optimize[i] == "minlength") { int mincriteriaPercentile = int(seqLength.size() * ((100 - criteria) / (float) 100)); minLength = seqLength[mincriteriaPercentile]; m->mothurOut("Optimizing minlength to " + toString(minLength) + "."); m->mothurOutEndLine(); }
1229                         else if (optimize[i] == "maxlength") { maxLength = seqLength[criteriaPercentile]; m->mothurOut("Optimizing maxlength to " + toString(maxLength) + "."); m->mothurOutEndLine(); }
1230                 }
1231         
1232 #ifdef USE_MPI
1233     }
1234     
1235     MPI_Status status; 
1236     MPI_Comm_rank(MPI_COMM_WORLD, &pid); 
1237     MPI_Comm_size(MPI_COMM_WORLD, &processors); 
1238     
1239     if (pid == 0) { 
1240         //send file positions to all processes
1241         for(int i = 1; i < processors; i++) { 
1242             MPI_Send(&startPos, 1, MPI_INT, i, 2001, MPI_COMM_WORLD);
1243             MPI_Send(&endPos, 1, MPI_INT, i, 2001, MPI_COMM_WORLD);
1244             MPI_Send(&maxAmbig, 1, MPI_INT, i, 2001, MPI_COMM_WORLD);
1245             MPI_Send(&maxHomoP, 1, MPI_INT, i, 2001, MPI_COMM_WORLD);
1246             MPI_Send(&minLength, 1, MPI_INT, i, 2001, MPI_COMM_WORLD);
1247             MPI_Send(&maxLength, 1, MPI_INT, i, 2001, MPI_COMM_WORLD);
1248         }
1249     }else {
1250         MPI_Recv(&startPos, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status);
1251         MPI_Recv(&endPos, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status);
1252         MPI_Recv(&maxAmbig, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status);
1253         MPI_Recv(&maxHomoP, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status);
1254         MPI_Recv(&minLength, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status);
1255         MPI_Recv(&maxLength, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status);
1256     }
1257     MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case
1258 #endif
1259         return 0;
1260         
1261     }
1262         catch(exception& e) {
1263                 m->errorOut(e, "ScreenSeqsCommand", "getSummaryReport");
1264                 exit(1);
1265         }
1266 }
1267 //***************************************************************************************************************
1268 int ScreenSeqsCommand::optimizeContigs(){
1269         try {
1270                 vector<int> olengths;
1271                 vector<int> oStarts;
1272                 vector<int> oEnds;
1273                 vector<int> numMismatches;
1274         vector<int> numNs;
1275                 
1276         vector<unsigned long long> positions;
1277         vector<linePair> contigsLines;
1278 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
1279                 positions = m->divideFilePerLine(contigsreport, processors);
1280                 for (int i = 0; i < (positions.size()-1); i++) { contigsLines.push_back(linePair(positions[i], positions[(i+1)])); }    
1281 #else
1282                 if(processors == 1){ contigsLines.push_back(linePair(0, 1000));  }
1283         else {
1284             int numContigsSeqs = 0;
1285             positions = m->setFilePosEachLine(contigsreport, numContigsSeqs); 
1286             if (positions.size() < processors) { processors = positions.size(); }
1287             
1288             //figure out how many sequences you have to process
1289             int numSeqsPerProcessor = numContigsSeqs / processors;
1290             for (int i = 0; i < processors; i++) {
1291                 int startIndex =  i * numSeqsPerProcessor;
1292                 if(i == (processors - 1)){      numSeqsPerProcessor = numContigsSeqs - i * numSeqsPerProcessor;         }
1293                 contigsLines.push_back(linePair(positions[startIndex], numSeqsPerProcessor));
1294             }
1295         }
1296 #endif
1297                 
1298 #ifdef USE_MPI
1299                 int pid;
1300                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); 
1301                 
1302                 if (pid == 0) { 
1303                         driverContigsSummary(olengths, oStarts, oEnds, numMismatches, numNs, contigsLines[0]);
1304 #else
1305             createProcessesContigsSummary(olengths, oStarts, oEnds, numMismatches, numNs, contigsLines); 
1306             
1307                         if (m->control_pressed) {  return 0; }
1308 #endif
1309             sort(olengths.begin(), olengths.end());
1310             sort(oStarts.begin(), oStarts.end());
1311             sort(oEnds.begin(), oEnds.end());
1312             sort(numMismatches.begin(), numMismatches.end());
1313             sort(numNs.begin(), numNs.end());
1314             
1315             //numSeqs is the number of unique seqs, startPosition.size() is the total number of seqs, we want to optimize using all seqs
1316             int criteriaPercentile      = int(oStarts.size() * (criteria / (float) 100));
1317             
1318             for (int i = 0; i < optimize.size(); i++) {
1319                 if (optimize[i] == "ostart") { oStart = oStarts[criteriaPercentile]; m->mothurOut("Optimizing ostart to " + toString(oStart) + "."); m->mothurOutEndLine(); }
1320                 else if (optimize[i] == "oend") { int endcriteriaPercentile = int(oEnds.size() * ((100 - criteria) / (float) 100));  oEnd = oEnds[endcriteriaPercentile]; m->mothurOut("Optimizing oend to " + toString(oEnd) + "."); m->mothurOutEndLine();}
1321                 else if (optimize[i] == "mismatches") { mismatches = numMismatches[criteriaPercentile]; m->mothurOut("Optimizing mismatches to " + toString(mismatches) + "."); m->mothurOutEndLine(); }
1322                 else if (optimize[i] == "maxn") { maxN = numNs[criteriaPercentile]; m->mothurOut("Optimizing maxn to " + toString(maxN) + "."); m->mothurOutEndLine(); }
1323                 else if (optimize[i] == "minoverlap") { int mincriteriaPercentile = int(olengths.size() * ((100 - criteria) / (float) 100)); minOverlap = olengths[mincriteriaPercentile]; m->mothurOut("Optimizing minoverlap to " + toString(minOverlap) + "."); m->mothurOutEndLine(); }
1324
1325             }
1326             
1327 #ifdef USE_MPI
1328                 }
1329                 
1330                 MPI_Status status; 
1331                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); 
1332                 MPI_Comm_size(MPI_COMM_WORLD, &processors); 
1333         
1334                 if (pid == 0) { 
1335                         //send file positions to all processes
1336                         for(int i = 1; i < processors; i++) { 
1337                 MPI_Send(&minOverlap, 1, MPI_INT, i, 2001, MPI_COMM_WORLD);
1338                                 MPI_Send(&oStart, 1, MPI_INT, i, 2001, MPI_COMM_WORLD);
1339                                 MPI_Send(&oEnd, 1, MPI_INT, i, 2001, MPI_COMM_WORLD);
1340                                 MPI_Send(&mismatches, 1, MPI_INT, i, 2001, MPI_COMM_WORLD);
1341                 MPI_Send(&maxN, 1, MPI_INT, i, 2001, MPI_COMM_WORLD);
1342                         }
1343                 }else {
1344             MPI_Recv(&minOverlap, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status);
1345                         MPI_Recv(&oStart, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status);
1346                         MPI_Recv(&oEnd, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status);
1347                         MPI_Recv(&mismatches, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status);
1348             MPI_Recv(&maxN, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status);
1349                 }
1350                 MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case
1351 #endif
1352                 return 0;
1353         }
1354         catch(exception& e) {
1355                 m->errorOut(e, "ScreenSeqsCommand", "optimizeContigs");
1356                 exit(1);
1357         }
1358 }
1359 /**************************************************************************************/
1360 int ScreenSeqsCommand::driverContigsSummary(vector<int>& oLength, vector<int>& ostartPosition, vector<int>& oendPosition, vector<int>& omismatches, vector<int>& numNs, linePair filePos) {     
1361         try {
1362                 
1363         string name;
1364         //Name  Length  Overlap_Length  Overlap_Start   Overlap_End     MisMatches      Num_Ns
1365         int length, OLength, thisOStart, thisOEnd, numMisMatches, numns;
1366         
1367                 ifstream in;
1368                 m->openInputFile(contigsreport, in);
1369         
1370                 in.seekg(filePos.start);
1371         if (filePos.start == 0) { //read headers
1372             m->getline(in); m->gobble(in);
1373         }
1374         
1375                 bool done = false;
1376                 int count = 0;
1377         
1378                 while (!done) {
1379             
1380                         if (m->control_pressed) { in.close(); return 1; }
1381             
1382             //seqname   start   end     nbases  ambigs  polymer numSeqs
1383             in >> name >> length >> OLength >> thisOStart >> thisOEnd >> numMisMatches >> numns; m->gobble(in);
1384             
1385             int num = 1;
1386             if ((namefile != "") || (countfile !="")){
1387                 //make sure this sequence is in the namefile, else error 
1388                 map<string, int>::iterator it = nameMap.find(name);
1389                 
1390                 if (it == nameMap.end()) { m->mothurOut("[ERROR]: " + name + " is not in your namefile, please correct."); m->mothurOutEndLine(); m->control_pressed = true; }
1391                 else { num = it->second; }
1392             }
1393             
1394             //for each sequence this sequence represents
1395             for (int i = 0; i < num; i++) {
1396                 ostartPosition.push_back(thisOStart);
1397                 oendPosition.push_back(thisOEnd);
1398                 oLength.push_back(OLength);
1399                 omismatches.push_back(numMisMatches);
1400                 numNs.push_back(numns);
1401             }
1402             
1403             count++;
1404                         
1405                         //if((count) % 100 == 0){       m->mothurOut("Optimizing sequence: " + toString(count)); m->mothurOutEndLine();         }
1406 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
1407             unsigned long long pos = in.tellg();
1408             if ((pos == -1) || (pos >= filePos.end)) { break; }
1409 #else
1410             if (in.eof()) { break; }
1411 #endif
1412                 }
1413                 
1414                 in.close();
1415                 
1416                 return count;
1417         }
1418         catch(exception& e) {
1419                 m->errorOut(e, "ScreenSeqsCommand", "driverContigsSummary");
1420                 exit(1);
1421         }
1422 }
1423
1424 /**************************************************************************************************/
1425 int ScreenSeqsCommand::createProcessesContigsSummary(vector<int>& oLength, vector<int>& ostartPosition, vector<int>& oendPosition, vector<int>& omismatches, vector<int>& numNs, vector<linePair> contigsLines) {
1426         try {
1427         
1428         int process = 1;
1429                 int num = 0;
1430                 vector<int> processIDS;
1431         
1432 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
1433         
1434                 //loop through and create all the processes you want
1435                 while (process != processors) {
1436                         int pid = fork();
1437                         
1438                         if (pid > 0) {
1439                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
1440                                 process++;
1441                         }else if (pid == 0){
1442                                 num = driverContigsSummary(oLength, ostartPosition, oendPosition, omismatches, numNs, contigsLines[process]);
1443                                 
1444                                 //pass numSeqs to parent
1445                                 ofstream out;
1446                                 string tempFile = contigsreport + toString(getpid()) + ".num.temp";
1447                                 m->openOutputFile(tempFile, out);
1448                                 
1449                                 out << num << endl;
1450                                 out << ostartPosition.size() << endl;
1451                                 for (int k = 0; k < ostartPosition.size(); k++)         {               out << ostartPosition[k] << '\t';   }  out << endl;
1452                                 for (int k = 0; k < oendPosition.size(); k++)           {               out << oendPosition[k] << '\t';     }  out << endl;
1453                                 for (int k = 0; k < oLength.size(); k++)                        {               out << oLength[k] << '\t';          }  out << endl;
1454                                 for (int k = 0; k < omismatches.size(); k++)        {           out << omismatches[k] << '\t';      }  out << endl;
1455                 for (int k = 0; k < numNs.size(); k++)              {           out << numNs[k] << '\t';            }  out << endl;
1456                                 
1457                                 out.close();
1458                                 
1459                                 exit(0);
1460                         }else { 
1461                                 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine(); 
1462                                 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
1463                                 exit(0);
1464                         }
1465                 }
1466                 
1467                 num = driverContigsSummary(oLength, ostartPosition, oendPosition, omismatches, numNs, contigsLines[0]);
1468                 
1469                 //force parent to wait until all the processes are done
1470                 for (int i=0;i<processIDS.size();i++) { 
1471                         int temp = processIDS[i];
1472                         wait(&temp);
1473                 }
1474                 
1475                 //parent reads in and combine Filter info
1476                 for (int i = 0; i < processIDS.size(); i++) {
1477                         string tempFilename = contigsreport + toString(processIDS[i]) + ".num.temp";
1478                         ifstream in;
1479                         m->openInputFile(tempFilename, in);
1480                         
1481                         int temp, tempNum;
1482                         in >> tempNum; m->gobble(in); num += tempNum;
1483                         in >> tempNum; m->gobble(in);
1484                         for (int k = 0; k < tempNum; k++)                       {               in >> temp; ostartPosition.push_back(temp);             }               m->gobble(in);
1485                         for (int k = 0; k < tempNum; k++)                       {               in >> temp; oendPosition.push_back(temp);               }               m->gobble(in);
1486                         for (int k = 0; k < tempNum; k++)                       {               in >> temp; oLength.push_back(temp);                    }               m->gobble(in);
1487                         for (int k = 0; k < tempNum; k++)                       {               in >> temp; omismatches.push_back(temp);        }               m->gobble(in);
1488             for (int k = 0; k < tempNum; k++)                   {               in >> temp; numNs.push_back(temp);              }               m->gobble(in);
1489             
1490                         in.close();
1491                         m->mothurRemove(tempFilename);
1492                 }
1493                 
1494                 
1495 #else 
1496         //////////////////////////////////////////////////////////////////////////////////////////////////////
1497                 //Windows version shared memory, so be careful when passing variables through the seqSumData struct. 
1498                 //Above fork() will clone, so memory is separate, but that's not the case with windows, 
1499                 //Taking advantage of shared memory to allow both threads to add info to vectors.
1500                 //////////////////////////////////////////////////////////////////////////////////////////////////////
1501                 /*
1502                 vector<contigsSumData*> pDataArray; 
1503                 DWORD   dwThreadIdArray[processors-1];
1504                 HANDLE  hThreadArray[processors-1]; 
1505                 
1506                 //Create processor worker threads.
1507                 for( int i=0; i<processors-1; i++ ){
1508             
1509                         // Allocate memory for thread data.
1510                         contigsSumData* tempSum = new contigsSumData(contigsreport, m, contigsLines[i].start, contigsLines[i].end, namefile, countfile, nameMap);
1511                         pDataArray.push_back(tempSum);
1512                         
1513                         //MySeqSumThreadFunction is in header. It must be global or static to work with the threads.
1514                         //default security attributes, thread function name, argument to thread function, use default creation flags, returns the thread identifier
1515                         hThreadArray[i] = CreateThread(NULL, 0, MyContigsSumThreadFunction, pDataArray[i], 0, &dwThreadIdArray[i]);   
1516                 }
1517                 */
1518         contigsLines[processors-1].start = 0;
1519         //do your part
1520                 num = driverContigsSummary(oLength, ostartPosition, oendPosition, omismatches, numNs, contigsLines[processors-1]);
1521         /*
1522                 //Wait until all threads have terminated.
1523                 WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
1524                 
1525                 //Close all thread handles and free memory allocations.
1526                 for(int i=0; i < pDataArray.size(); i++){
1527                         num += pDataArray[i]->count;
1528             if (pDataArray[i]->count != pDataArray[i]->end) {
1529                 m->mothurOut("[ERROR]: process " + toString(i) + " only processed " + toString(pDataArray[i]->count) + " of " + toString(pDataArray[i]->end) + " sequences assigned to it, quitting. \n"); m->control_pressed = true; 
1530             }
1531             for (int k = 0; k < pDataArray[i]->ostartPosition.size(); k++)  {   ostartPosition.push_back(pDataArray[i]->ostartPosition[k]);     }
1532                         for (int k = 0; k < pDataArray[i]->oendPosition.size(); k++)    {       oendPosition.push_back(pDataArray[i]->oendPosition[k]);         }
1533             for (int k = 0; k < pDataArray[i]->oLength.size(); k++)         {   oLength.push_back(pDataArray[i]->oLength[k]);                   }
1534             for (int k = 0; k < pDataArray[i]->omismatches.size(); k++)     {   omismatches.push_back(pDataArray[i]->omismatches[k]);           }
1535             for (int k = 0; k < pDataArray[i]->numNs.size(); k++)           {   numNs.push_back(pDataArray[i]->numNs[k]);                       }
1536                         CloseHandle(hThreadArray[i]);
1537                         delete pDataArray[i];
1538                 }
1539         */
1540 #endif          
1541         return num;
1542         }
1543         catch(exception& e) {
1544                 m->errorOut(e, "ScreenSeqsCommand", "createProcessesContigsSummary");
1545                 exit(1);
1546         }
1547 }
1548 //***************************************************************************************************************
1549 int ScreenSeqsCommand::optimizeAlign(){
1550         try {
1551         
1552                 vector<float> sims;
1553                 vector<float> scores;
1554                 vector<int> inserts;
1555                 
1556         vector<unsigned long long> positions;
1557         vector<linePair> alignLines;
1558 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
1559                 positions = m->divideFilePerLine(alignreport, processors);
1560                 for (int i = 0; i < (positions.size()-1); i++) { alignLines.push_back(linePair(positions[i], positions[(i+1)])); }      
1561 #else
1562                 if(processors == 1){ alignLines.push_back(linePair(0, 1000));  }
1563         else {
1564             int numAlignSeqs = 0;
1565             positions = m->setFilePosEachLine(alignreport, numAlignSeqs); 
1566             if (positions.size() < processors) { processors = positions.size(); }
1567             
1568             //figure out how many sequences you have to process
1569             int numSeqsPerProcessor = numAlignSeqs / processors;
1570             for (int i = 0; i < processors; i++) {
1571                 int startIndex =  i * numSeqsPerProcessor;
1572                 if(i == (processors - 1)){      numSeqsPerProcessor = numAlignSeqs - i * numSeqsPerProcessor;   }
1573                 alignLines.push_back(linePair(positions[startIndex], numSeqsPerProcessor));
1574             }
1575         }
1576 #endif
1577                 
1578 #ifdef USE_MPI
1579                 int pid;
1580                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); 
1581                 
1582                 if (pid == 0) { 
1583                         driverAlignSummary(sims, scores, inserts, alignLines[0]);
1584 #else
1585             createProcessesAlignSummary(sims, scores, inserts, alignLines); 
1586             
1587                         if (m->control_pressed) {  return 0; }
1588 #endif
1589             sort(sims.begin(), sims.end());
1590             sort(scores.begin(), scores.end());
1591             sort(inserts.begin(), inserts.end());
1592             
1593             //numSeqs is the number of unique seqs, startPosition.size() is the total number of seqs, we want to optimize using all seqs
1594             int criteriaPercentile      = int(sims.size() * (criteria / (float) 100));
1595             
1596             for (int i = 0; i < optimize.size(); i++) {
1597                 if (optimize[i] == "minsim") { int mincriteriaPercentile = int(sims.size() * ((100 - criteria) / (float) 100)); minSim = sims[mincriteriaPercentile];  m->mothurOut("Optimizing minsim to " + toString(minSim) + "."); m->mothurOutEndLine();}
1598                 else if (optimize[i] == "minscore") { int mincriteriaPercentile = int(scores.size() * ((100 - criteria) / (float) 100)); minScore = scores[mincriteriaPercentile];  m->mothurOut("Optimizing minscore to " + toString(minScore) + "."); m->mothurOutEndLine(); }
1599                 else if (optimize[i] == "maxinsert") { maxInsert = inserts[criteriaPercentile]; m->mothurOut("Optimizing maxinsert to " + toString(maxInsert) + "."); m->mothurOutEndLine(); }
1600             }
1601             
1602 #ifdef USE_MPI
1603                 }
1604                 
1605                 MPI_Status status; 
1606                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); 
1607                 MPI_Comm_size(MPI_COMM_WORLD, &processors); 
1608         
1609                 if (pid == 0) { 
1610                         //send file positions to all processes
1611                         for(int i = 1; i < processors; i++) { 
1612                 MPI_Send(&minSim, 1, MPI_INT, i, 2001, MPI_COMM_WORLD);
1613                                 MPI_Send(&minScore, 1, MPI_INT, i, 2001, MPI_COMM_WORLD);
1614                                 MPI_Send(&maxInsert, 1, MPI_INT, i, 2001, MPI_COMM_WORLD);
1615                         }
1616                 }else {
1617             MPI_Recv(&minSim, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status);
1618                         MPI_Recv(&minScore, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status);
1619                         MPI_Recv(&maxInsert, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status);
1620                 }
1621                 MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case
1622 #endif
1623                 return 0;
1624         }
1625         catch(exception& e) {
1626                 m->errorOut(e, "ScreenSeqsCommand", "optimizeContigs");
1627                 exit(1);
1628         }
1629 }
1630 /**************************************************************************************/
1631 int ScreenSeqsCommand::driverAlignSummary(vector<float>& sims, vector<float>& scores, vector<int>& inserts, linePair filePos) { 
1632         try {
1633                 
1634         string name, TemplateName, SearchMethod, AlignmentMethod;
1635         //QueryName     QueryLength     TemplateName    TemplateLength  SearchMethod    SearchScore     AlignmentMethod QueryStart      QueryEnd        TemplateStart   TemplateEnd     PairwiseAlignmentLength GapsInQuery     GapsInTemplate  LongestInsert   SimBtwnQuery&Template
1636         //checking for minScore, maxInsert, minSim
1637         int length, TemplateLength,      QueryStart,    QueryEnd,       TemplateStart,  TemplateEnd,    PairwiseAlignmentLength,        GapsInQuery,    GapsInTemplate, LongestInsert;
1638         float SearchScore, SimBtwnQueryTemplate;
1639          
1640                 ifstream in;
1641                 m->openInputFile(alignreport, in);
1642         
1643                 in.seekg(filePos.start);
1644         if (filePos.start == 0) { //read headers
1645             m->getline(in); m->gobble(in);
1646         }
1647         
1648                 bool done = false;
1649                 int count = 0;
1650         
1651                 while (!done) {
1652             
1653                         if (m->control_pressed) { in.close(); return 1; }
1654             
1655             in >> name >> length >> TemplateName >> TemplateLength >> SearchMethod >> SearchScore >> AlignmentMethod >> QueryStart >> QueryEnd >> TemplateStart >> TemplateEnd >> PairwiseAlignmentLength >> GapsInQuery >> GapsInTemplate >> LongestInsert >> SimBtwnQueryTemplate; m->gobble(in);
1656             
1657             int num = 1;
1658             if ((namefile != "") || (countfile !="")){
1659                 //make sure this sequence is in the namefile, else error 
1660                 map<string, int>::iterator it = nameMap.find(name);
1661                 
1662                 if (it == nameMap.end()) { m->mothurOut("[ERROR]: " + name + " is not in your namefile, please correct."); m->mothurOutEndLine(); m->control_pressed = true; }
1663                 else { num = it->second; }
1664             }
1665             
1666             //for each sequence this sequence represents
1667             for (int i = 0; i < num; i++) {
1668                 sims.push_back(SimBtwnQueryTemplate);
1669                 scores.push_back(SearchScore);
1670                 inserts.push_back(LongestInsert);
1671             }
1672             
1673             count++;
1674                         
1675                         //if((count) % 100 == 0){       m->mothurOut("Optimizing sequence: " + toString(count)); m->mothurOutEndLine();         }
1676 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
1677             unsigned long long pos = in.tellg();
1678             if ((pos == -1) || (pos >= filePos.end)) { break; }
1679 #else
1680             if (in.eof()) { break; }
1681 #endif
1682                 }
1683                 
1684                 in.close();
1685                 
1686                 return count;
1687         }
1688         catch(exception& e) {
1689                 m->errorOut(e, "ScreenSeqsCommand", "driverAlignSummary");
1690                 exit(1);
1691         }
1692 }
1693
1694 /**************************************************************************************************/
1695 int ScreenSeqsCommand::createProcessesAlignSummary(vector<float>& sims, vector<float>& scores, vector<int>& inserts, vector<linePair> alignLines) {
1696         try {
1697         
1698         int process = 1;
1699                 int num = 0;
1700                 vector<int> processIDS;
1701         
1702 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
1703         
1704                 //loop through and create all the processes you want
1705                 while (process != processors) {
1706                         int pid = fork();
1707                         
1708                         if (pid > 0) {
1709                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
1710                                 process++;
1711                         }else if (pid == 0){
1712                                 num = driverAlignSummary(sims, scores, inserts, alignLines[process]);
1713                                 
1714                                 //pass numSeqs to parent
1715                                 ofstream out;
1716                                 string tempFile = alignreport + toString(getpid()) + ".num.temp";
1717                                 m->openOutputFile(tempFile, out);
1718                                 
1719                                 out << num << endl;
1720                                 out << sims.size() << endl;
1721                                 for (int k = 0; k < sims.size(); k++)           {               out << sims[k] << '\t';         }  out << endl;
1722                                 for (int k = 0; k < scores.size(); k++)         {               out << scores[k] << '\t';       }  out << endl;
1723                                 for (int k = 0; k < inserts.size(); k++)        {               out << inserts[k] << '\t';      }  out << endl;
1724                                 
1725                                 out.close();
1726                                 
1727                                 exit(0);
1728                         }else { 
1729                                 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine(); 
1730                                 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
1731                                 exit(0);
1732                         }
1733                 }
1734                 
1735                 num = driverAlignSummary(sims, scores, inserts, alignLines[0]);
1736                 
1737                 //force parent to wait until all the processes are done
1738                 for (int i=0;i<processIDS.size();i++) { 
1739                         int temp = processIDS[i];
1740                         wait(&temp);
1741                 }
1742                 
1743                 //parent reads in and combine Filter info
1744                 for (int i = 0; i < processIDS.size(); i++) {
1745                         string tempFilename = alignreport + toString(processIDS[i]) + ".num.temp";
1746                         ifstream in;
1747                         m->openInputFile(tempFilename, in);
1748                         
1749                         int temp, tempNum;
1750             float temp2;
1751                         in >> tempNum; m->gobble(in); num += tempNum;
1752                         in >> tempNum; m->gobble(in);
1753                         for (int k = 0; k < tempNum; k++)                       {               in >> temp2; sims.push_back(temp2);             }               m->gobble(in);
1754                         for (int k = 0; k < tempNum; k++)                       {               in >> temp2; scores.push_back(temp2);           }               m->gobble(in);
1755                         for (int k = 0; k < tempNum; k++)                       {               in >> temp; inserts.push_back(temp);    }               m->gobble(in);
1756                           
1757                         in.close();
1758                         m->mothurRemove(tempFilename);
1759                 }
1760                 
1761                 
1762 #else 
1763         //////////////////////////////////////////////////////////////////////////////////////////////////////
1764                 //Windows version shared memory, so be careful when passing variables through the seqSumData struct. 
1765                 //Above fork() will clone, so memory is separate, but that's not the case with windows, 
1766                 //Taking advantage of shared memory to allow both threads to add info to vectors.
1767                 //////////////////////////////////////////////////////////////////////////////////////////////////////
1768                 /*
1769                 vector<alignsData*> pDataArray; 
1770                 DWORD   dwThreadIdArray[processors-1];
1771                 HANDLE  hThreadArray[processors-1]; 
1772                 
1773                 //Create processor worker threads.
1774                 for( int i=0; i<processors-1; i++ ){
1775             
1776                         // Allocate memory for thread data.
1777                         alignsData* tempSum = new alignsData(alignreport, m, alignLines[i].start, alignLines[i].end, namefile, countfile, nameMap);
1778                         pDataArray.push_back(tempSum);
1779                         
1780                         //MySeqSumThreadFunction is in header. It must be global or static to work with the threads.
1781                         //default security attributes, thread function name, argument to thread function, use default creation flags, returns the thread identifier
1782                         hThreadArray[i] = CreateThread(NULL, 0, MyAlignsThreadFunction, pDataArray[i], 0, &dwThreadIdArray[i]);   
1783                 }*/
1784                 alignLines[processors-1].start = 0;
1785         //do your part
1786                 num = driverAlignSummary(sims, scores, inserts, alignLines[processors-1]);
1787        /*
1788                 //Wait until all threads have terminated.
1789                 WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
1790                 
1791                 //Close all thread handles and free memory allocations.
1792                 for(int i=0; i < pDataArray.size(); i++){
1793                         num += pDataArray[i]->count;
1794             if (pDataArray[i]->count != pDataArray[i]->end) {
1795                 m->mothurOut("[ERROR]: process " + toString(i) + " only processed " + toString(pDataArray[i]->count) + " of " + toString(pDataArray[i]->end) + " sequences assigned to it, quitting. \n"); m->control_pressed = true; 
1796             }
1797             for (int k = 0; k < pDataArray[i]->sims.size(); k++)        {       sims.push_back(pDataArray[i]->sims[k]);         }
1798                         for (int k = 0; k < pDataArray[i]->scores.size(); k++)      {   scores.push_back(pDataArray[i]->scores[k]);     }
1799             for (int k = 0; k < pDataArray[i]->inserts.size(); k++)     {       inserts.push_back(pDataArray[i]->inserts[k]);   }
1800                 CloseHandle(hThreadArray[i]);
1801                         delete pDataArray[i];
1802                 }
1803         */
1804 #endif          
1805         return num;
1806         }
1807         catch(exception& e) {
1808                 m->errorOut(e, "ScreenSeqsCommand", "createProcessesAlignSummary");
1809                 exit(1);
1810         }
1811 }
1812 //***************************************************************************************************************
1813 int ScreenSeqsCommand::getSummary(vector<unsigned long long>& positions){
1814         try {
1815                 
1816                 vector<int> startPosition;
1817                 vector<int> endPosition;
1818                 vector<int> seqLength;
1819                 vector<int> ambigBases;
1820                 vector<int> longHomoPolymer;
1821         vector<int> numNs;
1822                 
1823         vector<unsigned long long> positions;
1824 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
1825                 positions = m->divideFile(fastafile, processors);
1826                 for (int i = 0; i < (positions.size()-1); i++) { lines.push_back(linePair(positions[i], positions[(i+1)])); }   
1827 #else
1828                 if(processors == 1){ lines.push_back(linePair(0, 1000));  }
1829         else {
1830             int numFastaSeqs = 0;
1831             positions = m->setFilePosFasta(fastafile, numFastaSeqs); 
1832             if (positions.size() < processors) { processors = positions.size(); }
1833             
1834             //figure out how many sequences you have to process
1835             int numSeqsPerProcessor = numFastaSeqs / processors;
1836             for (int i = 0; i < processors; i++) {
1837                 int startIndex =  i * numSeqsPerProcessor;
1838                 if(i == (processors - 1)){      numSeqsPerProcessor = numFastaSeqs - i * numSeqsPerProcessor;   }
1839                 lines.push_back(linePair(positions[startIndex], numSeqsPerProcessor));
1840             }
1841         }
1842 #endif
1843                 
1844 #ifdef USE_MPI
1845                 int pid;
1846                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); 
1847                 
1848                 if (pid == 0) { 
1849                         driverCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, numNs, fastafile, lines[0]);
1850 #else
1851                 int numSeqs = 0;
1852                 //#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
1853                         if(processors == 1){
1854                                 numSeqs = driverCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, numNs, fastafile, lines[0]);
1855                         }else{
1856                                 numSeqs = createProcessesCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, numNs, fastafile); 
1857                         }
1858                                 
1859                         if (m->control_pressed) {  return 0; }
1860 #endif
1861                 sort(startPosition.begin(), startPosition.end());
1862                 sort(endPosition.begin(), endPosition.end());
1863                 sort(seqLength.begin(), seqLength.end());
1864                 sort(ambigBases.begin(), ambigBases.end());
1865                 sort(longHomoPolymer.begin(), longHomoPolymer.end());
1866         sort(numNs.begin(), numNs.end());
1867                 
1868                 //numSeqs is the number of unique seqs, startPosition.size() is the total number of seqs, we want to optimize using all seqs
1869                 int criteriaPercentile  = int(startPosition.size() * (criteria / (float) 100));
1870                 
1871                 for (int i = 0; i < optimize.size(); i++) {
1872                         if (optimize[i] == "start") { startPos = startPosition[criteriaPercentile]; m->mothurOut("Optimizing start to " + toString(startPos) + "."); m->mothurOutEndLine(); }
1873                         else if (optimize[i] == "end") { int endcriteriaPercentile = int(endPosition.size() * ((100 - criteria) / (float) 100));  endPos = endPosition[endcriteriaPercentile]; m->mothurOut("Optimizing end to " + toString(endPos) + "."); m->mothurOutEndLine();}
1874                         else if (optimize[i] == "maxambig") { maxAmbig = ambigBases[criteriaPercentile]; m->mothurOut("Optimizing maxambig to " + toString(maxAmbig) + "."); m->mothurOutEndLine(); }
1875                         else if (optimize[i] == "maxhomop") { maxHomoP = longHomoPolymer[criteriaPercentile]; m->mothurOut("Optimizing maxhomop to " + toString(maxHomoP) + "."); m->mothurOutEndLine(); }
1876                         else if (optimize[i] == "minlength") { int mincriteriaPercentile = int(seqLength.size() * ((100 - criteria) / (float) 100)); minLength = seqLength[mincriteriaPercentile]; m->mothurOut("Optimizing minlength to " + toString(minLength) + "."); m->mothurOutEndLine(); }
1877                         else if (optimize[i] == "maxlength") { maxLength = seqLength[criteriaPercentile]; m->mothurOut("Optimizing maxlength to " + toString(maxLength) + "."); m->mothurOutEndLine(); }
1878             else if (optimize[i] == "maxn") { maxN = numNs[criteriaPercentile]; m->mothurOut("Optimizing maxn to " + toString(maxN) + "."); m->mothurOutEndLine(); }
1879                 }
1880
1881 #ifdef USE_MPI
1882                 }
1883                 
1884                 MPI_Status status; 
1885                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); 
1886                 MPI_Comm_size(MPI_COMM_WORLD, &processors); 
1887                         
1888                 if (pid == 0) { 
1889                         //send file positions to all processes
1890                         for(int i = 1; i < processors; i++) { 
1891                                 MPI_Send(&startPos, 1, MPI_INT, i, 2001, MPI_COMM_WORLD);
1892                                 MPI_Send(&endPos, 1, MPI_INT, i, 2001, MPI_COMM_WORLD);
1893                                 MPI_Send(&maxAmbig, 1, MPI_INT, i, 2001, MPI_COMM_WORLD);
1894                                 MPI_Send(&maxHomoP, 1, MPI_INT, i, 2001, MPI_COMM_WORLD);
1895                                 MPI_Send(&minLength, 1, MPI_INT, i, 2001, MPI_COMM_WORLD);
1896                                 MPI_Send(&maxLength, 1, MPI_INT, i, 2001, MPI_COMM_WORLD);
1897                 MPI_Send(&maxN, 1, MPI_INT, i, 2001, MPI_COMM_WORLD);
1898                         }
1899                 }else {
1900                         MPI_Recv(&startPos, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status);
1901                         MPI_Recv(&endPos, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status);
1902                         MPI_Recv(&maxAmbig, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status);
1903                         MPI_Recv(&maxHomoP, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status);
1904                         MPI_Recv(&minLength, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status);
1905                         MPI_Recv(&maxLength, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status);
1906             MPI_Recv(&maxN, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status);
1907                 }
1908                 MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case
1909 #endif
1910                 return 0;
1911         }
1912         catch(exception& e) {
1913                 m->errorOut(e, "ScreenSeqsCommand", "getSummary");
1914                 exit(1);
1915         }
1916 }
1917 /**************************************************************************************/
1918 int ScreenSeqsCommand::driverCreateSummary(vector<int>& startPosition, vector<int>& endPosition, vector<int>& seqLength, vector<int>& ambigBases, vector<int>& longHomoPolymer, vector<int>& numNs, string filename, linePair filePos) {        
1919         try {
1920                 
1921                 ifstream in;
1922                 m->openInputFile(filename, in);
1923                                 
1924                 in.seekg(filePos.start);
1925
1926                 bool done = false;
1927                 int count = 0;
1928         
1929                 while (!done) {
1930                                 
1931                         if (m->control_pressed) { in.close(); return 1; }
1932                                         
1933                         Sequence current(in); m->gobble(in);
1934         
1935                         if (current.getName() != "") {
1936                                 int num = 1;
1937                                 if ((namefile != "") || (countfile !="")){
1938                                         //make sure this sequence is in the namefile, else error 
1939                                         map<string, int>::iterator it = nameMap.find(current.getName());
1940                                         
1941                                         if (it == nameMap.end()) { m->mothurOut("[ERROR]: " + current.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); m->control_pressed = true; }
1942                                         else { num = it->second; }
1943                                 }
1944                                 
1945                                 //for each sequence this sequence represents
1946                 int numns = current.getNumNs();
1947                                 for (int i = 0; i < num; i++) {
1948                                         startPosition.push_back(current.getStartPos());
1949                                         endPosition.push_back(current.getEndPos());
1950                                         seqLength.push_back(current.getNumBases());
1951                                         ambigBases.push_back(current.getAmbigBases());
1952                                         longHomoPolymer.push_back(current.getLongHomoPolymer());
1953                     numNs.push_back(numns);
1954                                 }
1955                                 
1956                                 count++;
1957                         }
1958                         //if((count) % 100 == 0){       m->mothurOut("Optimizing sequence: " + toString(count)); m->mothurOutEndLine();         }
1959                         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
1960                                 unsigned long long pos = in.tellg();
1961                                 if ((pos == -1) || (pos >= filePos.end)) { break; }
1962                         #else
1963                                 if (in.eof()) { break; }
1964                         #endif
1965                         
1966                 }
1967                 
1968                 in.close();
1969                 
1970                 return count;
1971         }
1972         catch(exception& e) {
1973                 m->errorOut(e, "ScreenSeqsCommand", "driverCreateSummary");
1974                 exit(1);
1975         }
1976 }
1977 /**************************************************************************************************/
1978 int ScreenSeqsCommand::createProcessesCreateSummary(vector<int>& startPosition, vector<int>& endPosition, vector<int>& seqLength, vector<int>& ambigBases, vector<int>& longHomoPolymer, vector<int>& numNs, string filename) {
1979         try {
1980         
1981         int process = 1;
1982                 int num = 0;
1983                 vector<int> processIDS;
1984
1985 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
1986                                 
1987                 //loop through and create all the processes you want
1988                 while (process != processors) {
1989                         int pid = fork();
1990                         
1991                         if (pid > 0) {
1992                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
1993                                 process++;
1994                         }else if (pid == 0){
1995                                 num = driverCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, numNs, fastafile, lines[process]);
1996                                 
1997                                 //pass numSeqs to parent
1998                                 ofstream out;
1999                                 string tempFile = fastafile + toString(getpid()) + ".num.temp";
2000                                 m->openOutputFile(tempFile, out);
2001                                 
2002                                 out << num << endl;
2003                                 out << startPosition.size() << endl;
2004                                 for (int k = 0; k < startPosition.size(); k++)          {               out << startPosition[k] << '\t'; }  out << endl;
2005                                 for (int k = 0; k < endPosition.size(); k++)            {               out << endPosition[k] << '\t'; }  out << endl;
2006                                 for (int k = 0; k < seqLength.size(); k++)                      {               out << seqLength[k] << '\t'; }  out << endl;
2007                                 for (int k = 0; k < ambigBases.size(); k++)                     {               out << ambigBases[k] << '\t'; }  out << endl;
2008                                 for (int k = 0; k < longHomoPolymer.size(); k++)        {               out << longHomoPolymer[k] << '\t'; }  out << endl;
2009                 for (int k = 0; k < numNs.size(); k++)  {               out << numNs[k] << '\t'; }  out << endl;
2010                                 
2011                                 out.close();
2012                                 
2013                                 exit(0);
2014                         }else { 
2015                                 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine(); 
2016                                 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
2017                                 exit(0);
2018                         }
2019                 }
2020                 
2021                 num = driverCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, numNs, fastafile, lines[0]);
2022                 
2023                 //force parent to wait until all the processes are done
2024                 for (int i=0;i<processIDS.size();i++) { 
2025                         int temp = processIDS[i];
2026                         wait(&temp);
2027                 }
2028                 
2029                 //parent reads in and combine Filter info
2030                 for (int i = 0; i < processIDS.size(); i++) {
2031                         string tempFilename = fastafile + toString(processIDS[i]) + ".num.temp";
2032                         ifstream in;
2033                         m->openInputFile(tempFilename, in);
2034                         
2035                         int temp, tempNum;
2036                         in >> tempNum; m->gobble(in); num += tempNum;
2037                         in >> tempNum; m->gobble(in);
2038                         for (int k = 0; k < tempNum; k++)                       {               in >> temp; startPosition.push_back(temp);              }               m->gobble(in);
2039                         for (int k = 0; k < tempNum; k++)                       {               in >> temp; endPosition.push_back(temp);                }               m->gobble(in);
2040                         for (int k = 0; k < tempNum; k++)                       {               in >> temp; seqLength.push_back(temp);                  }               m->gobble(in);
2041                         for (int k = 0; k < tempNum; k++)                       {               in >> temp; ambigBases.push_back(temp);                 }               m->gobble(in);
2042                         for (int k = 0; k < tempNum; k++)                       {               in >> temp; longHomoPolymer.push_back(temp);    }               m->gobble(in);
2043             for (int k = 0; k < tempNum; k++)                   {               in >> temp; numNs.push_back(temp);      }               m->gobble(in);
2044                                 
2045                         in.close();
2046                         m->mothurRemove(tempFilename);
2047                 }
2048                 
2049                 
2050 #else 
2051         //////////////////////////////////////////////////////////////////////////////////////////////////////
2052                 //Windows version shared memory, so be careful when passing variables through the seqSumData struct. 
2053                 //Above fork() will clone, so memory is separate, but that's not the case with windows, 
2054                 //Taking advantage of shared memory to allow both threads to add info to vectors.
2055                 //////////////////////////////////////////////////////////////////////////////////////////////////////
2056                 
2057                 vector<sumData*> pDataArray; 
2058                 DWORD   dwThreadIdArray[processors-1];
2059                 HANDLE  hThreadArray[processors-1]; 
2060                 
2061                 //Create processor worker threads.
2062                 for( int i=0; i<processors-1; i++ ){
2063             
2064                         // Allocate memory for thread data.
2065                         sumData* tempSum = new sumData(filename, m, lines[i].start, lines[i].end, namefile, countfile, nameMap);
2066                         pDataArray.push_back(tempSum);
2067                         
2068                         //MySeqSumThreadFunction is in header. It must be global or static to work with the threads.
2069                         //default security attributes, thread function name, argument to thread function, use default creation flags, returns the thread identifier
2070                         hThreadArray[i] = CreateThread(NULL, 0, MySumThreadFunction, pDataArray[i], 0, &dwThreadIdArray[i]);   
2071                 }
2072                 
2073         //do your part
2074                 num = driverCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, numNs, fastafile, lines[processors-1]);
2075          
2076                 //Wait until all threads have terminated.
2077                 WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
2078                 
2079                 //Close all thread handles and free memory allocations.
2080                 for(int i=0; i < pDataArray.size(); i++){
2081                         num += pDataArray[i]->count;
2082             if (pDataArray[i]->count != pDataArray[i]->end) {
2083                 m->mothurOut("[ERROR]: process " + toString(i) + " only processed " + toString(pDataArray[i]->count) + " of " + toString(pDataArray[i]->end) + " sequences assigned to it, quitting. \n"); m->control_pressed = true; 
2084             }
2085             for (int k = 0; k < pDataArray[i]->startPosition.size(); k++) {     startPosition.push_back(pDataArray[i]->startPosition[k]);       }
2086                         for (int k = 0; k < pDataArray[i]->endPosition.size(); k++) {   endPosition.push_back(pDataArray[i]->endPosition[k]);       }
2087             for (int k = 0; k < pDataArray[i]->seqLength.size(); k++) { seqLength.push_back(pDataArray[i]->seqLength[k]);       }
2088             for (int k = 0; k < pDataArray[i]->ambigBases.size(); k++) {        ambigBases.push_back(pDataArray[i]->ambigBases[k]);       }
2089             for (int k = 0; k < pDataArray[i]->longHomoPolymer.size(); k++) {   longHomoPolymer.push_back(pDataArray[i]->longHomoPolymer[k]);       }
2090             for (int k = 0; k < pDataArray[i]->numNs.size(); k++) {     numNs.push_back(pDataArray[i]->numNs[k]);       }
2091                         CloseHandle(hThreadArray[i]);
2092                         delete pDataArray[i];
2093                 }
2094
2095 #endif          
2096         return num;
2097         }
2098         catch(exception& e) {
2099                 m->errorOut(e, "ScreenSeqsCommand", "createProcessesCreateSummary");
2100                 exit(1);
2101         }
2102 }
2103
2104 //***************************************************************************************************************
2105
2106 int ScreenSeqsCommand::screenGroupFile(map<string, string> badSeqNames){
2107         try {
2108                 ifstream inputGroups;
2109                 m->openInputFile(groupfile, inputGroups);
2110                 string seqName, group;
2111                 map<string, string>::iterator it;
2112                 map<string, string> variables;
2113                 variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(groupfile));
2114         variables["[extension]"] = m->getExtension(groupfile);
2115         string goodGroupFile = getOutputFileName("group", variables);
2116         outputNames.push_back(goodGroupFile);  outputTypes["group"].push_back(goodGroupFile);
2117                 ofstream goodGroupOut;  m->openOutputFile(goodGroupFile, goodGroupOut);
2118                 
2119                 while(!inputGroups.eof()){
2120                         if (m->control_pressed) { goodGroupOut.close(); inputGroups.close(); m->mothurRemove(goodGroupFile); return 0; }
2121
2122                         inputGroups >> seqName; m->gobble(inputGroups); inputGroups >> group;
2123                         it = badSeqNames.find(seqName);
2124                         
2125                         if(it != badSeqNames.end()){
2126                                 badSeqNames.erase(it);
2127                         }
2128                         else{
2129                                 goodGroupOut << seqName << '\t' << group << endl;
2130                         }
2131                         m->gobble(inputGroups);
2132                 }
2133                 
2134                 if (m->control_pressed) { goodGroupOut.close();  inputGroups.close(); m->mothurRemove(goodGroupFile);  return 0; }
2135
2136                 //we were unable to remove some of the bad sequences
2137                 if (badSeqNames.size() != 0) {
2138                         for (it = badSeqNames.begin(); it != badSeqNames.end(); it++) {  
2139                                 m->mothurOut("Your groupfile does not include the sequence " + it->first + " please correct."); 
2140                                 m->mothurOutEndLine();
2141                         }
2142                 }
2143                 
2144                 inputGroups.close();
2145                 goodGroupOut.close();
2146                 
2147                 if (m->control_pressed) { m->mothurRemove(goodGroupFile);   }
2148                 
2149                 return 0;
2150         
2151         }
2152         catch(exception& e) {
2153                 m->errorOut(e, "ScreenSeqsCommand", "screenGroupFile");
2154                 exit(1);
2155         }
2156 }
2157 //***************************************************************************************************************
2158 int ScreenSeqsCommand::screenCountFile(map<string, string> badSeqNames){
2159         try {
2160                 ifstream in;
2161                 m->openInputFile(countfile, in);
2162                 map<string, string>::iterator it;
2163                 map<string, string> variables;
2164                 variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(countfile));
2165         variables["[extension]"] = m->getExtension(countfile);
2166         string goodCountFile = getOutputFileName("count", variables);
2167                 
2168         outputNames.push_back(goodCountFile);  outputTypes["count"].push_back(goodCountFile);
2169                 ofstream goodCountOut;  m->openOutputFile(goodCountFile, goodCountOut);
2170                 
2171         string headers = m->getline(in); m->gobble(in);
2172         goodCountOut << headers << endl;
2173         
2174         string name, rest; int thisTotal;
2175         while (!in.eof()) {
2176
2177                         if (m->control_pressed) { goodCountOut.close(); in.close(); m->mothurRemove(goodCountFile); return 0; }
2178             
2179                         in >> name; m->gobble(in); 
2180             in >> thisTotal; m->gobble(in);
2181             rest = m->getline(in); m->gobble(in);
2182             
2183                         it = badSeqNames.find(name);
2184                         
2185                         if(it != badSeqNames.end()){
2186                                 badSeqNames.erase(it);
2187                         }
2188                         else{
2189                                 goodCountOut << name << '\t' << thisTotal << '\t' << rest << endl;
2190                         }
2191                 }
2192                 
2193                 if (m->control_pressed) { goodCountOut.close();  in.close(); m->mothurRemove(goodCountFile);  return 0; }
2194         
2195                 //we were unable to remove some of the bad sequences
2196                 if (badSeqNames.size() != 0) {
2197                         for (it = badSeqNames.begin(); it != badSeqNames.end(); it++) {  
2198                                 m->mothurOut("Your count file does not include the sequence " + it->first + " please correct."); 
2199                                 m->mothurOutEndLine();
2200                         }
2201                 }
2202                 
2203                 in.close();
2204                 goodCountOut.close();
2205         
2206         //check for groups that have been eliminated
2207         CountTable ct;
2208         if (ct.testGroups(goodCountFile)) {
2209             ct.readTable(goodCountFile, true);
2210             ct.printTable(goodCountFile);
2211         }
2212                 
2213                 if (m->control_pressed) { m->mothurRemove(goodCountFile);   }
2214                 
2215                 return 0;
2216         
2217         }
2218         catch(exception& e) {
2219                 m->errorOut(e, "ScreenSeqsCommand", "screenCountFile");
2220                 exit(1);
2221         }
2222 }
2223 //***************************************************************************************************************
2224
2225 int ScreenSeqsCommand::screenTaxonomy(map<string, string> badSeqNames){
2226         try {
2227                 ifstream input;
2228                 m->openInputFile(taxonomy, input);
2229                 string seqName, tax;
2230                 map<string, string>::iterator it;
2231         map<string, string> variables;
2232                 variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(taxonomy));
2233         variables["[extension]"] = m->getExtension(taxonomy);
2234         string goodTaxFile = getOutputFileName("taxonomy", variables);
2235
2236                 outputNames.push_back(goodTaxFile);  outputTypes["taxonomy"].push_back(goodTaxFile);
2237                 ofstream goodTaxOut;    m->openOutputFile(goodTaxFile, goodTaxOut);
2238                                 
2239                 while(!input.eof()){
2240                         if (m->control_pressed) { goodTaxOut.close(); input.close(); m->mothurRemove(goodTaxFile); return 0; }
2241                         
2242                         input >> seqName; m->gobble(input); input >> tax;
2243                         it = badSeqNames.find(seqName);
2244                         
2245                         if(it != badSeqNames.end()){ badSeqNames.erase(it); }
2246                         else{
2247                                 goodTaxOut << seqName << '\t' << tax << endl;
2248                         }
2249                         m->gobble(input);
2250                 }
2251                 
2252                 if (m->control_pressed) { goodTaxOut.close(); input.close(); m->mothurRemove(goodTaxFile); return 0; }
2253                 
2254                 //we were unable to remove some of the bad sequences
2255                 if (badSeqNames.size() != 0) {
2256                         for (it = badSeqNames.begin(); it != badSeqNames.end(); it++) {  
2257                                 m->mothurOut("Your taxonomy file does not include the sequence " + it->first + " please correct."); 
2258                                 m->mothurOutEndLine();
2259                         }
2260                 }
2261                 
2262                 input.close();
2263                 goodTaxOut.close();
2264                 
2265                 if (m->control_pressed) {  m->mothurRemove(goodTaxFile);  return 0; }
2266                 
2267                 return 0;
2268                 
2269         }
2270         catch(exception& e) {
2271                 m->errorOut(e, "ScreenSeqsCommand", "screenTaxonomy");
2272                 exit(1);
2273         }
2274         
2275 }
2276 //***************************************************************************************************************
2277
2278 int ScreenSeqsCommand::screenQual(map<string, string> badSeqNames){
2279         try {
2280                 ifstream in;
2281                 m->openInputFile(qualfile, in);
2282                 map<string, string>::iterator it;
2283                 map<string, string> variables;
2284                 variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(qualfile));
2285         variables["[extension]"] = m->getExtension(qualfile);
2286         string goodQualFile = getOutputFileName("qfile", variables);
2287                 
2288                 outputNames.push_back(goodQualFile);  outputTypes["qfile"].push_back(goodQualFile);
2289                 ofstream goodQual;      m->openOutputFile(goodQualFile, goodQual);
2290                 
2291                 while(!in.eof()){       
2292                         
2293                         if (m->control_pressed) { goodQual.close(); in.close(); m->mothurRemove(goodQualFile); return 0; }
2294
2295                         string saveName = "";
2296                         string name = "";
2297                         string scores = "";
2298                         
2299                         in >> name; 
2300                         
2301                         if (name.length() != 0) { 
2302                                 saveName = name.substr(1);
2303                                 while (!in.eof())       {       
2304                                         char c = in.get(); 
2305                                         if (c == 10 || c == 13 || c == -1){     break;  }
2306                                         else { name += c; }     
2307                                 } 
2308                                 m->gobble(in);
2309                         }
2310                         
2311                         while(in){
2312                                 char letter= in.get();
2313                                 if(letter == '>'){      in.putback(letter);     break;  }
2314                                 else{ scores += letter; }
2315                         }
2316                         
2317                         m->gobble(in);
2318                         
2319                         it = badSeqNames.find(saveName);
2320                         
2321                         if(it != badSeqNames.end()){
2322                                 badSeqNames.erase(it);
2323                         }else{                          
2324                                 goodQual << name << endl << scores;
2325                         }
2326                         
2327                         m->gobble(in);
2328                 }
2329                 
2330                 in.close();
2331                 goodQual.close();
2332                 
2333                 //we were unable to remove some of the bad sequences
2334                 if (badSeqNames.size() != 0) {
2335                         for (it = badSeqNames.begin(); it != badSeqNames.end(); it++) {  
2336                                 m->mothurOut("Your qual file does not include the sequence " + it->first + " please correct."); 
2337                                 m->mothurOutEndLine();
2338                         }
2339                 }
2340                 
2341                 if (m->control_pressed) {  m->mothurRemove(goodQualFile);  return 0; }
2342                 
2343                 return 0;
2344                 
2345         }
2346         catch(exception& e) {
2347                 m->errorOut(e, "ScreenSeqsCommand", "screenQual");
2348                 exit(1);
2349         }
2350         
2351 }
2352 //**********************************************************************************************************************
2353
2354 int ScreenSeqsCommand::driver(linePair filePos, string goodFName, string badAccnosFName, string filename, map<string, string>& badSeqNames){
2355         try {
2356                 ofstream goodFile;
2357                 m->openOutputFile(goodFName, goodFile);
2358                 
2359                 ofstream badAccnosFile;
2360                 m->openOutputFile(badAccnosFName, badAccnosFile);
2361                 
2362                 ifstream inFASTA;
2363                 m->openInputFile(filename, inFASTA);
2364
2365                 inFASTA.seekg(filePos.start);
2366
2367                 bool done = false;
2368                 int count = 0;
2369         
2370                 while (!done) {
2371                 
2372                         if (m->control_pressed) {  return 0; }
2373                         
2374                         Sequence currSeq(inFASTA); m->gobble(inFASTA);
2375                         if (currSeq.getName() != "") {
2376                                 bool goodSeq = 1;               //      innocent until proven guilty
2377                 string trashCode = "";
2378                 //have the report files found you bad
2379                 map<string, string>::iterator it = badSeqNames.find(currSeq.getName());
2380                 if (it != badSeqNames.end()) { goodSeq = 0;  trashCode = it->second; }  
2381                 
2382                 if (summaryfile == "") { //summaryfile includes these so no need to check again
2383                     if(startPos != -1 && startPos < currSeq.getStartPos())                      {       goodSeq = 0;    trashCode += "start|"; }
2384                     if(endPos != -1 && endPos > currSeq.getEndPos())                            {       goodSeq = 0;    trashCode += "end|";}
2385                     if(maxAmbig != -1 && maxAmbig <     currSeq.getAmbigBases())                {       goodSeq = 0;    trashCode += "ambig|";}
2386                     if(maxHomoP != -1 && maxHomoP < currSeq.getLongHomoPolymer())       {       goodSeq = 0;    trashCode += "homop|";}
2387                     if(minLength != -1 && minLength > currSeq.getNumBases())            {       goodSeq = 0;    trashCode += "<length|";}
2388                     if(maxLength != -1 && maxLength < currSeq.getNumBases())            {       goodSeq = 0;    trashCode += ">length|";}
2389                 }
2390                 
2391                 if (contigsreport == "") { //contigs report includes this so no need to check again
2392                     if(maxN != -1 && maxN < currSeq.getNumNs())                     {   goodSeq = 0;    trashCode += "n|"; }
2393                 }
2394                                 
2395                                 if(goodSeq == 1){
2396                                         currSeq.printSequence(goodFile);        
2397                                 }else{
2398                                         badAccnosFile << currSeq.getName() << '\t' << trashCode.substr(0, trashCode.length()-1) << endl;
2399                                         badSeqNames[currSeq.getName()] = trashCode;
2400                                 }
2401                 count++;
2402                         }
2403                         
2404                         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
2405                                 unsigned long long pos = inFASTA.tellg();
2406                                 if ((pos == -1) || (pos >= filePos.end)) { break; }
2407                         #else
2408                                 if (inFASTA.eof()) { break; }
2409                         #endif
2410                         
2411                         //report progress
2412                         if((count) % 100 == 0){ m->mothurOutJustToScreen("Processing sequence: " + toString(count)+"\n");               }
2413                 }
2414                 //report progress
2415                 if((count) % 100 != 0){ m->mothurOutJustToScreen("Processing sequence: " + toString(count)+"\n");       }
2416                 
2417                         
2418                 goodFile.close();
2419                 inFASTA.close();
2420                 badAccnosFile.close();
2421                 
2422                 return count;
2423         }
2424         catch(exception& e) {
2425                 m->errorOut(e, "ScreenSeqsCommand", "driver");
2426                 exit(1);
2427         }
2428 }
2429 //**********************************************************************************************************************
2430 #ifdef USE_MPI
2431 int ScreenSeqsCommand::driverMPI(int start, int num, MPI_File& inMPI, MPI_File& goodFile, MPI_File& badAccnosFile, vector<unsigned long long>& MPIPos, map<string, string>& badSeqNames){
2432         try {
2433                 string outputString = "";
2434                 MPI_Status statusGood; 
2435                 MPI_Status statusBadAccnos; 
2436                 MPI_Status status; 
2437                 int pid;
2438                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
2439
2440                 for(int i=0;i<num;i++){
2441                 
2442                         if (m->control_pressed) {  return 0; }
2443                         
2444                         //read next sequence
2445                         int length = MPIPos[start+i+1] - MPIPos[start+i];
2446
2447                         char* buf4 = new char[length];
2448
2449                         MPI_File_read_at(inMPI, MPIPos[start+i], buf4, length, MPI_CHAR, &status);
2450                         
2451                         string tempBuf = buf4;  delete buf4;
2452                         if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length);  }
2453                         istringstream iss (tempBuf,istringstream::in);
2454                         
2455                         Sequence currSeq(iss);                  
2456                         
2457                         //process seq
2458                         if (currSeq.getName() != "") {
2459                                 bool goodSeq = 1;               //      innocent until proven guilty
2460                 string trashCode = "";
2461                 //have the report files found you bad
2462                 map<string, string>::iterator it = badSeqNames.find(currSeq.getName());
2463                 if (it != badSeqNames.end()) { goodSeq = 0;  trashCode = it->second; }  
2464                 
2465                 if (summaryfile == "") { //summaryfile includes these so no need to check again
2466                     if(startPos != -1 && startPos < currSeq.getStartPos())                      {       goodSeq = 0;    trashCode += "start|"; }
2467                     if(endPos != -1 && endPos > currSeq.getEndPos())                            {       goodSeq = 0;    trashCode += "end|";}
2468                     if(maxAmbig != -1 && maxAmbig <     currSeq.getAmbigBases())                {       goodSeq = 0;    trashCode += "ambig|";}
2469                     if(maxHomoP != -1 && maxHomoP < currSeq.getLongHomoPolymer())       {       goodSeq = 0;    trashCode += "homop|";}
2470                     if(minLength != -1 && minLength > currSeq.getNumBases())            {       goodSeq = 0;    trashCode += "<length|";}
2471                     if(maxLength != -1 && maxLength < currSeq.getNumBases())            {       goodSeq = 0;    trashCode += ">length|";}
2472                 }
2473                 
2474                 if (contigsreport == "") { //contigs report includes this so no need to check again
2475                     if(maxN != -1 && maxN < currSeq.getNumNs())                     {   goodSeq = 0;    trashCode += "n|"; }
2476                 }
2477                                 
2478                 
2479                                 if(goodSeq == 1){
2480                                         outputString =  ">" + currSeq.getName() + "\n" + currSeq.getAligned() + "\n";
2481                                 
2482                                         //print good seq
2483                                         length = outputString.length();
2484                                         char* buf2 = new char[length];
2485                                         memcpy(buf2, outputString.c_str(), length);
2486                                         
2487                                         MPI_File_write_shared(goodFile, buf2, length, MPI_CHAR, &statusGood);
2488                                         delete buf2;
2489                                 }
2490                                 else{
2491
2492                                         badSeqNames[currSeq.getName()] = trashCode;
2493                                         
2494                                         //write to bad accnos file
2495                                         outputString = currSeq.getName() + "\t" + trashCode.substr(0, trashCode.length()-1) + "\n";
2496                                 
2497                                         length = outputString.length();
2498                                         char* buf3 = new char[length];
2499                                         memcpy(buf3, outputString.c_str(), length);
2500                                         
2501                                         MPI_File_write_shared(badAccnosFile, buf3, length, MPI_CHAR, &statusBadAccnos);
2502                                         delete buf3;
2503                                 }
2504                         }
2505                         
2506                         //report progress
2507                         if((i) % 100 == 0){     m->mothurOutJustToScreen("Processing sequence: " + toString(i)+"\n");           }
2508                 }
2509                                 
2510                 return 1;
2511         }
2512         catch(exception& e) {
2513                 m->errorOut(e, "ScreenSeqsCommand", "driverMPI");
2514                 exit(1);
2515         }
2516 }
2517 #endif
2518 /**************************************************************************************************/
2519
2520 int ScreenSeqsCommand::createProcesses(string goodFileName, string badAccnos, string filename, map<string, string>& badSeqNames) {
2521         try {
2522         
2523         vector<int> processIDS;   
2524         int process = 1;
2525                 int num = 0;
2526
2527 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
2528                                 
2529                 //loop through and create all the processes you want
2530                 while (process != processors) {
2531                         int pid = fork();
2532                         
2533                         if (pid > 0) {
2534                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
2535                                 process++;
2536                         }else if (pid == 0){
2537                                 num = driver(lines[process], goodFileName + toString(getpid()) + ".temp", badAccnos + toString(getpid()) + ".temp", filename, badSeqNames);
2538                                 
2539                                 //pass numSeqs to parent
2540                                 ofstream out;
2541                                 string tempFile = filename + toString(getpid()) + ".num.temp";
2542                                 m->openOutputFile(tempFile, out);
2543                                 out << num << endl;
2544                                 out.close();
2545                                 
2546                                 exit(0);
2547                         }else { 
2548                                 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine(); 
2549                                 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
2550                                 exit(0);
2551                         }
2552                 }
2553                 
2554         num = driver(lines[0], goodFileName, badAccnos, filename, badSeqNames);
2555         
2556                 //force parent to wait until all the processes are done
2557                 for (int i=0;i<processIDS.size();i++) { 
2558                         int temp = processIDS[i];
2559                         wait(&temp);
2560                 }
2561                 
2562                 for (int i = 0; i < processIDS.size(); i++) {
2563                         ifstream in;
2564                         string tempFile =  filename + toString(processIDS[i]) + ".num.temp";
2565                         m->openInputFile(tempFile, in);
2566                         if (!in.eof()) { int tempNum = 0; in >> tempNum; num += tempNum; }
2567                         in.close(); m->mothurRemove(tempFile);
2568             
2569             m->appendFiles((goodFileName + toString(processIDS[i]) + ".temp"), goodFileName);
2570             m->mothurRemove((goodFileName + toString(processIDS[i]) + ".temp"));
2571                         
2572             m->appendFiles((badAccnos + toString(processIDS[i]) + ".temp"), badAccnos);
2573             m->mothurRemove((badAccnos + toString(processIDS[i]) + ".temp"));
2574                 }
2575                 
2576         //read badSeqs in because root process doesnt know what other "bad" seqs the children found
2577         ifstream inBad;
2578         int ableToOpen = m->openInputFile(badAccnos, inBad, "no error");
2579         
2580         if (ableToOpen == 0) {
2581             badSeqNames.clear();
2582             string tempName, trashCode;
2583             while (!inBad.eof()) {
2584                 inBad >> tempName >> trashCode; m->gobble(inBad);
2585                 badSeqNames[tempName] = trashCode;
2586             }
2587             inBad.close();
2588         }
2589 #else
2590         
2591         //////////////////////////////////////////////////////////////////////////////////////////////////////
2592                 //Windows version shared memory, so be careful when passing variables through the sumScreenData struct. 
2593                 //Above fork() will clone, so memory is separate, but that's not the case with windows, 
2594                 //Taking advantage of shared memory to allow both threads to add info to badSeqNames.
2595                 //////////////////////////////////////////////////////////////////////////////////////////////////////
2596                 
2597                 vector<sumScreenData*> pDataArray; 
2598                 DWORD   dwThreadIdArray[processors-1];
2599                 HANDLE  hThreadArray[processors-1]; 
2600                 
2601                 //Create processor worker threads.
2602                 for( int i=0; i<processors-1; i++ ){
2603             
2604             string extension = "";
2605             if (i!=0) {extension += toString(i) + ".temp"; processIDS.push_back(i); }
2606             
2607                         // Allocate memory for thread data.
2608                         sumScreenData* tempSum = new sumScreenData(startPos, endPos, maxAmbig, maxHomoP, minLength, maxLength, maxN, badSeqNames, filename, summaryfile, contigsreport, m, lines[i].start, lines[i].end,goodFileName+extension, badAccnos+extension);
2609                         pDataArray.push_back(tempSum);
2610                         
2611                         //default security attributes, thread function name, argument to thread function, use default creation flags, returns the thread identifier
2612                         hThreadArray[i] = CreateThread(NULL, 0, MySumScreenThreadFunction, pDataArray[i], 0, &dwThreadIdArray[i]);   
2613                 }
2614                 
2615         //do your part
2616         num = driver(lines[processors-1], (goodFileName+toString(processors-1)+".temp"), (badAccnos+toString(processors-1)+".temp"), filename, badSeqNames);
2617         processIDS.push_back(processors-1);
2618         
2619                 //Wait until all threads have terminated.
2620                 WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
2621                 
2622                 //Close all thread handles and free memory allocations.
2623                 for(int i=0; i < pDataArray.size(); i++){
2624                         num += pDataArray[i]->count;
2625             if (pDataArray[i]->count != pDataArray[i]->end) {
2626                 m->mothurOut("[ERROR]: process " + toString(i) + " only processed " + toString(pDataArray[i]->count) + " of " + toString(pDataArray[i]->end) + " sequences assigned to it, quitting. \n"); m->control_pressed = true; 
2627             }
2628             for (map<string, string>::iterator it = pDataArray[i]->badSeqNames.begin(); it != pDataArray[i]->badSeqNames.end(); it++) { badSeqNames[it->first] = it->second;       }
2629                         CloseHandle(hThreadArray[i]);
2630                         delete pDataArray[i];
2631                 }
2632         
2633         for (int i = 0; i < processIDS.size(); i++) {
2634             m->appendFiles((goodFileName + toString(processIDS[i]) + ".temp"), goodFileName);
2635             m->mothurRemove((goodFileName + toString(processIDS[i]) + ".temp"));
2636                         
2637             m->appendFiles((badAccnos + toString(processIDS[i]) + ".temp"), badAccnos);
2638             m->mothurRemove((badAccnos + toString(processIDS[i]) + ".temp"));
2639                 }
2640
2641 #endif  
2642         
2643         return num;
2644         
2645         }
2646         catch(exception& e) {
2647                 m->errorOut(e, "ScreenSeqsCommand", "createProcesses");
2648                 exit(1);
2649         }
2650 }
2651
2652 //***************************************************************************************************************
2653
2654