]> git.donarmstrong.com Git - mothur.git/blob - prcseqscommand.cpp
Merge remote-tracking branch 'mothur/master'
[mothur.git] / prcseqscommand.cpp
1 //
2 //  prcseqscommand.cpp
3 //  Mothur
4 //
5 //  Created by Sarah Westcott on 3/14/12.
6 //  Copyright (c) 2012 Schloss Lab. All rights reserved.
7 //
8
9 #include "pcrseqscommand.h"
10
11 //**********************************************************************************************************************
12 vector<string> PcrSeqsCommand::setParameters(){ 
13         try {
14                 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta);
15                 CommandParameter poligos("oligos", "InputTypes", "", "", "ecolioligos", "none", "none",false,false); parameters.push_back(poligos);
16                 CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname);
17         CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pgroup);
18         CommandParameter ptax("taxonomy", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(ptax);
19         CommandParameter pecoli("ecoli", "InputTypes", "", "", "ecolioligos", "none", "none",false,false); parameters.push_back(pecoli);
20                 CommandParameter pstart("start", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pstart);
21                 CommandParameter pend("end", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pend);
22                 CommandParameter pnomatch("nomatch", "Multiple", "reject-keep", "reject", "", "", "",false,false); parameters.push_back(pnomatch);
23                 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
24                 CommandParameter pkeepprimer("keepprimer", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pkeepprimer);
25         CommandParameter pkeepdots("keepdots", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pkeepdots);
26         CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
27                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
28         
29                 vector<string> myArray;
30                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
31                 return myArray;
32         }
33         catch(exception& e) {
34                 m->errorOut(e, "PcrSeqsCommand", "setParameters");
35                 exit(1);
36         }
37 }
38 //**********************************************************************************************************************
39 string PcrSeqsCommand::getHelpString(){ 
40         try {
41                 string helpString = "";
42                 helpString += "The pcr.seqs command reads a fasta file.\n";
43         helpString += "The pcr.seqs command parameters are fasta, oligos, name, group, taxonomy, ecoli, start, end, nomatch, processors, keepprimer and keepdots.\n";
44                 helpString += "The ecoli parameter is used to provide a fasta file containing a single reference sequence (e.g. for e. coli) this must be aligned. Mothur will trim to the start and end positions of the reference sequence.\n";
45         helpString += "The start parameter allows you to provide a starting position to trim to.\n";
46         helpString += "The end parameter allows you to provide a ending position to trim from.\n";
47         helpString += "The nomatch parameter allows you to decide what to do with sequences where the primer is not found. Default=reject, meaning remove from fasta file.  if nomatch=true, then do nothing to sequence.\n";
48         helpString += "The processors parameter allows you to use multiple processors.\n";
49         helpString += "The keepprimer parameter allows you to keep the primer, default=false.\n";
50         helpString += "The keepdots parameter allows you to keep the leading and trailing .'s, default=true.\n";
51                 helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n";
52                 helpString += "For more details please check out the wiki http://www.mothur.org/wiki/Pcr.seqs .\n";
53                 return helpString;
54         }
55         catch(exception& e) {
56                 m->errorOut(e, "PcrSeqsCommand", "getHelpString");
57                 exit(1);
58         }
59 }
60
61
62 //**********************************************************************************************************************
63
64 PcrSeqsCommand::PcrSeqsCommand(){       
65         try {
66                 abort = true; calledHelp = true; 
67                 setParameters();
68                 vector<string> tempOutNames;
69                 outputTypes["fasta"] = tempOutNames;
70                 outputTypes["taxonomy"] = tempOutNames;
71                 outputTypes["group"] = tempOutNames;
72                 outputTypes["name"] = tempOutNames;
73         outputTypes["accnos"] = tempOutNames;
74         }
75         catch(exception& e) {
76                 m->errorOut(e, "PcrSeqsCommand", "PcrSeqsCommand");
77                 exit(1);
78         }
79 }
80 //***************************************************************************************************************
81
82 PcrSeqsCommand::PcrSeqsCommand(string option)  {
83         try {
84                 
85                 abort = false; calledHelp = false;   
86                 
87                 //allow user to run help
88                 if(option == "help") { help(); abort = true; calledHelp = true; }
89                 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
90                 
91                 else {
92                         vector<string> myArray = setParameters();
93                         
94                         OptionParser parser(option);
95                         map<string,string> parameters = parser.getParameters();
96                         
97                         ValidParameters validParameter;
98                         map<string,string>::iterator it;
99                         
100                         //check to make sure all parameters are valid for command
101                         for (it = parameters.begin(); it != parameters.end(); it++) { 
102                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
103                         }
104                         
105                         //initialize outputTypes
106                         vector<string> tempOutNames;
107                         outputTypes["fasta"] = tempOutNames;
108                         outputTypes["taxonomy"] = tempOutNames;
109                         outputTypes["group"] = tempOutNames;
110                         outputTypes["name"] = tempOutNames;
111             outputTypes["accnos"] = tempOutNames;
112                         
113                         //if the user changes the input directory command factory will send this info to us in the output parameter 
114                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
115                         if (inputDir == "not found"){   inputDir = "";          }
116                         else {
117                                 string path;
118                                 it = parameters.find("fasta");
119                                 //user has given a template file
120                                 if(it != parameters.end()){ 
121                                         path = m->hasPath(it->second);
122                                         //if the user has not given a path then, add inputdir. else leave path alone.
123                                         if (path == "") {       parameters["fasta"] = inputDir + it->second;            }
124                                 }
125                                 
126                                 it = parameters.find("oligos");
127                                 //user has given a template file
128                                 if(it != parameters.end()){ 
129                                         path = m->hasPath(it->second);
130                                         //if the user has not given a path then, add inputdir. else leave path alone.
131                                         if (path == "") {       parameters["oligos"] = inputDir + it->second;           }
132                                 }
133                 
134                 it = parameters.find("ecoli");
135                                 //user has given a template file
136                                 if(it != parameters.end()){ 
137                                         path = m->hasPath(it->second);
138                                         //if the user has not given a path then, add inputdir. else leave path alone.
139                                         if (path == "") {       parameters["ecoli"] = inputDir + it->second;            }
140                                 }
141                                 
142                                 it = parameters.find("taxonomy");
143                                 //user has given a template file
144                                 if(it != parameters.end()){ 
145                                         path = m->hasPath(it->second);
146                                         //if the user has not given a path then, add inputdir. else leave path alone.
147                                         if (path == "") {       parameters["taxonomy"] = inputDir + it->second;         }
148                                 }
149                                 
150                                 it = parameters.find("name");
151                                 //user has given a template file
152                                 if(it != parameters.end()){ 
153                                         path = m->hasPath(it->second);
154                                         //if the user has not given a path then, add inputdir. else leave path alone.
155                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
156                                 }
157                 
158                 it = parameters.find("group");
159                                 //user has given a template file
160                                 if(it != parameters.end()){ 
161                                         path = m->hasPath(it->second);
162                                         //if the user has not given a path then, add inputdir. else leave path alone.
163                                         if (path == "") {       parameters["group"] = inputDir + it->second;            }
164                                 }
165                                 
166                         }
167             
168                         
169                         //check for required parameters
170                         fastafile = validParameter.validFile(parameters, "fasta", true);
171                         if (fastafile == "not found") {                                 
172                                 fastafile = m->getFastaFile(); 
173                                 if (fastafile != "") { m->mothurOut("Using " + fastafile + " as input file for the fasta parameter."); m->mothurOutEndLine(); }
174                                 else {  m->mothurOut("You have no current fastafile and the fasta parameter is required."); m->mothurOutEndLine(); abort = true; }
175                         }else if (fastafile == "not open") { fastafile = ""; abort = true; }    
176                         else { m->setFastaFile(fastafile); }
177                         
178             //if the user changes the output directory command factory will send this info to us in the output parameter 
179                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = m->hasPath(fastafile);      }
180
181                         //check for optional parameter and set defaults
182                         // ...at some point should added some additional type checking...
183                         string temp;
184                         temp = validParameter.validFile(parameters, "keepprimer", false);  if (temp == "not found")    {        temp = "f";     }
185                         keepprimer = m->isTrue(temp);   
186             
187             temp = validParameter.validFile(parameters, "keepdots", false);  if (temp == "not found")    {      temp = "t";     }
188                         keepdots = m->isTrue(temp);     
189             
190                         temp = validParameter.validFile(parameters, "oligos", true);
191                         if (temp == "not found"){       oligosfile = "";                }
192                         else if(temp == "not open"){    oligosfile = ""; abort = true;  } 
193                         else                                    {       oligosfile = temp; m->setOligosFile(oligosfile);                }
194                         
195             ecolifile = validParameter.validFile(parameters, "ecoli", true);
196                         if (ecolifile == "not found"){  ecolifile = "";         }
197                         else if(ecolifile == "not open"){       ecolifile = ""; abort = true;   } 
198                         
199             namefile = validParameter.validFile(parameters, "name", true);
200                         if (namefile == "not found"){   namefile = "";          }
201                         else if(namefile == "not open"){        namefile = ""; abort = true;    } 
202             else { m->setNameFile(namefile); }
203             
204             groupfile = validParameter.validFile(parameters, "group", true);
205                         if (groupfile == "not found"){  groupfile = "";         }
206                         else if(groupfile == "not open"){       groupfile = ""; abort = true;   } 
207             else { m->setGroupFile(groupfile); }
208             
209             taxfile = validParameter.validFile(parameters, "taxonomy", true);
210                         if (taxfile == "not found"){    taxfile = "";           }
211                         else if(taxfile == "not open"){ taxfile = ""; abort = true;     } 
212             else { m->setTaxonomyFile(taxfile); }
213                                                 
214                         temp = validParameter.validFile(parameters, "start", false);    if (temp == "not found") { temp = "-1"; }
215                         m->mothurConvert(temp, start);
216             
217             temp = validParameter.validFile(parameters, "end", false);  if (temp == "not found") { temp = "-1"; }
218                         m->mothurConvert(temp, end);
219                         
220                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found"){       temp = m->getProcessors();      }
221                         m->setProcessors(temp);
222                         m->mothurConvert(temp, processors); 
223                         
224             nomatch = validParameter.validFile(parameters, "nomatch", false);   if (nomatch == "not found") { nomatch = "reject"; }
225                         
226             if ((nomatch != "reject") && (nomatch != "keep")) { m->mothurOut("[ERROR]: " + nomatch + " is not a valid entry for nomatch. Choices are reject and keep.\n");  abort = true; }
227             
228             //didnt set anything
229                         if ((oligosfile == "") && (ecolifile == "") && (start == -1) && (end == -1)) {
230                 m->mothurOut("[ERROR]: You did not set any options. Please provide an oligos or ecoli file, or set start or end.\n"); abort = true;
231             }
232             
233             if ((oligosfile == "") && (ecolifile == "") && (start < 0) && (end == -1)) { m->mothurOut("[ERROR]: Invalid start value.\n"); abort = true; }
234             
235             if ((ecolifile != "") && (start != -1) && (end != -1)) {
236                 m->mothurOut("[ERROR]: You provided an ecoli file , but set the start or end parameters. Unsure what you intend.  When you provide the ecoli file, mothur thinks you want to use the start and end of the sequence in the ecoli file.\n"); abort = true;
237             }
238
239             
240             if ((oligosfile != "") && (ecolifile != "")) {
241                  m->mothurOut("[ERROR]: You can not use an ecoli file at the same time as an oligos file.\n"); abort = true;
242             }
243                         
244                         //check to make sure you didn't forget the name file by mistake                 
245                         if (namefile == "") {
246                                 vector<string> files; files.push_back(fastafile);
247                                 parser.getNameFile(files);
248                         }
249                 }
250         
251         }
252         catch(exception& e) {
253                 m->errorOut(e, "PcrSeqsCommand", "PcrSeqsCommand");
254                 exit(1);
255         }
256 }
257 //***************************************************************************************************************
258
259 int PcrSeqsCommand::execute(){
260         try{
261         
262                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
263                 
264         int start = time(NULL);
265         
266         string thisOutputDir = outputDir;
267                 if (outputDir == "") {  thisOutputDir += m->hasPath(fastafile);  }
268                 string trimSeqFile = thisOutputDir + m->getRootName(m->getSimpleName(fastafile)) + "pcr.fasta";
269                 outputNames.push_back(trimSeqFile); outputTypes["fasta"].push_back(trimSeqFile);
270         
271         string badSeqFile = thisOutputDir + m->getRootName(m->getSimpleName(fastafile)) + "pcr.scrap.fasta";
272                 
273                 
274         length = 0;
275                 if(oligosfile != ""){    readOligos();     }  if (m->control_pressed) {  return 0; }
276         if(ecolifile != "") {    readEcoli();      }  if (m->control_pressed) {  return 0; }
277         
278         vector<unsigned long long> positions; 
279         int numFastaSeqs = 0;
280 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
281         positions = m->divideFile(fastafile, processors);
282         for (int i = 0; i < (positions.size()-1); i++) {        lines.push_back(linePair(positions[i], positions[(i+1)]));      }
283 #else
284         if (processors == 1) {
285             lines.push_back(linePair(0, 1000));
286         }else {
287             positions = m->setFilePosFasta(fastafile, numFastaSeqs); 
288             if (positions.size() < processors) { processors = positions.size(); }
289             
290             //figure out how many sequences you have to process
291             int numSeqsPerProcessor = numFastaSeqs / processors;
292             for (int i = 0; i < processors; i++) {
293                 int startIndex =  i * numSeqsPerProcessor;
294                 if(i == (processors - 1)){      numSeqsPerProcessor = numFastaSeqs - i * numSeqsPerProcessor;   }
295                 lines.push_back(linePair(positions[startIndex], numSeqsPerProcessor));
296             }
297         }
298 #endif
299         if (m->control_pressed) {  return 0; }
300
301         set<string> badNames;
302         if(processors == 1) {    numFastaSeqs = driverPcr(fastafile, trimSeqFile, badSeqFile, badNames, lines[0]);   }
303         else                {    numFastaSeqs = createProcesses(fastafile, trimSeqFile, badSeqFile, badNames);       }  
304                 
305                 if (m->control_pressed) {  return 0; }          
306         
307         //don't write or keep if blank
308         if (badNames.size() != 0)   { writeAccnos(badNames);        }   
309         if (m->isBlank(badSeqFile)) { m->mothurRemove(badSeqFile);  }
310         else { outputNames.push_back(badSeqFile); outputTypes["fasta"].push_back(badSeqFile); }
311         
312         if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) {        m->mothurRemove(outputNames[i]); } return 0; }
313         if (namefile != "")                     {               readName(badNames);             }   
314         if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) {        m->mothurRemove(outputNames[i]); } return 0; }
315         if (groupfile != "")            {               readGroup(badNames);    }
316         if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) {        m->mothurRemove(outputNames[i]); } return 0; }
317                 if (taxfile != "")                      {               readTax(badNames);              }
318                 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) {        m->mothurRemove(outputNames[i]); } return 0; }
319         
320         m->mothurOutEndLine();
321                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
322                 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
323                 m->mothurOutEndLine();
324                 m->mothurOutEndLine();
325                 
326                 //set fasta file as new current fastafile
327                 string current = "";
328                 itTypes = outputTypes.find("fasta");
329                 if (itTypes != outputTypes.end()) {
330                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFastaFile(current); }
331                 }
332                 
333                 itTypes = outputTypes.find("name");
334                 if (itTypes != outputTypes.end()) {
335                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setNameFile(current); }
336                 }
337                 
338                 itTypes = outputTypes.find("group");
339                 if (itTypes != outputTypes.end()) {
340                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setGroupFile(current); }
341                 }
342                 
343                 itTypes = outputTypes.find("accnos");
344                 if (itTypes != outputTypes.end()) {
345                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setAccnosFile(current); }
346                 }
347                 
348                 itTypes = outputTypes.find("taxonomy");
349                 if (itTypes != outputTypes.end()) {
350                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setTaxonomyFile(current); }
351                 }
352         
353                 m->mothurOut("It took " + toString(time(NULL) - start) + " secs to screen " + toString(numFastaSeqs) + " sequences.");
354                 m->mothurOutEndLine();
355
356                 
357                 return 0;       
358         
359         }
360         catch(exception& e) {
361                 m->errorOut(e, "PcrSeqsCommand", "execute");
362                 exit(1);
363         }
364 }
365 /**************************************************************************************************/
366 int PcrSeqsCommand::createProcesses(string filename, string goodFileName, string badFileName, set<string>& badSeqNames) {
367         try {
368         
369         vector<int> processIDS;   
370         int process = 1;
371                 int num = 0;
372         
373 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
374         
375                 //loop through and create all the processes you want
376                 while (process != processors) {
377                         int pid = fork();
378                         
379                         if (pid > 0) {
380                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
381                                 process++;
382                         }else if (pid == 0){
383                                 num = driverPcr(filename, goodFileName + toString(getpid()) + ".temp", badFileName + toString(getpid()) + ".temp", badSeqNames, lines[process]);
384                                 
385                                 //pass numSeqs to parent
386                                 ofstream out;
387                                 string tempFile = filename + toString(getpid()) + ".num.temp";
388                                 m->openOutputFile(tempFile, out);
389                                 out << num << '\t' << badSeqNames.size() << endl;
390                 for (set<string>::iterator it = badSeqNames.begin(); it != badSeqNames.end(); it++) {
391                     out << (*it) << endl;
392                 }
393                                 out.close();
394                                 
395                                 exit(0);
396                         }else { 
397                                 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine(); 
398                                 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
399                                 exit(0);
400                         }
401                 }
402                 
403         num = driverPcr(filename, goodFileName, badFileName, badSeqNames, lines[0]);
404         
405                 //force parent to wait until all the processes are done
406                 for (int i=0;i<processIDS.size();i++) { 
407                         int temp = processIDS[i];
408                         wait(&temp);
409                 }
410                 
411                 for (int i = 0; i < processIDS.size(); i++) {
412                         ifstream in;
413                         string tempFile =  filename + toString(processIDS[i]) + ".num.temp";
414                         m->openInputFile(tempFile, in);
415             int numBadNames = 0; string name = "";
416                         if (!in.eof()) { int tempNum = 0; in >> tempNum >> numBadNames; num += tempNum; m->gobble(in); }
417             for (int j = 0; j < numBadNames; j++) {
418                 in >> name; m->gobble(in);
419                 badSeqNames.insert(name);
420             }
421                         in.close(); m->mothurRemove(tempFile);
422             
423             m->appendFiles((goodFileName + toString(processIDS[i]) + ".temp"), goodFileName);
424             m->mothurRemove((goodFileName + toString(processIDS[i]) + ".temp"));
425             
426             m->appendFiles((badFileName + toString(processIDS[i]) + ".temp"), badFileName);
427             m->mothurRemove((badFileName + toString(processIDS[i]) + ".temp"));
428                 }
429     #else
430         
431         //////////////////////////////////////////////////////////////////////////////////////////////////////
432                 //Windows version shared memory, so be careful when passing variables through the sumScreenData struct. 
433                 //Above fork() will clone, so memory is separate, but that's not the case with windows, 
434                 //Taking advantage of shared memory to allow both threads to add info to badSeqNames.
435                 //////////////////////////////////////////////////////////////////////////////////////////////////////
436                 
437                 vector<pcrData*> pDataArray; 
438                 DWORD   dwThreadIdArray[processors-1];
439                 HANDLE  hThreadArray[processors-1]; 
440                 
441                 //Create processor worker threads.
442                 for( int i=0; i<processors-1; i++ ){
443             
444             string extension = "";
445             if (i!=0) {extension += toString(i) + ".temp"; processIDS.push_back(i); }
446             
447                         // Allocate memory for thread data.
448                         pcrData* tempPcr = new pcrData(filename, goodFileName+extension, badFileName+extension, m, oligosfile, ecolifile, primers, revPrimer, nomatch, keepprimer, keepdots, start, end, length, lines[i].start, lines[i].end);
449                         pDataArray.push_back(tempPcr);
450                         
451                         //default security attributes, thread function name, argument to thread function, use default creation flags, returns the thread identifier
452                         hThreadArray[i] = CreateThread(NULL, 0, MyPcrThreadFunction, pDataArray[i], 0, &dwThreadIdArray[i]);   
453                 }
454                 
455         //do your part
456         num = driverPcr(filename, (goodFileName+toString(processors-1)+".temp"), (badFileName+toString(processors-1)+".temp"),badSeqNames, lines[processors-1]);
457         processIDS.push_back(processors-1);
458         
459                 //Wait until all threads have terminated.
460                 WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
461                 
462                 //Close all thread handles and free memory allocations.
463                 for(int i=0; i < pDataArray.size(); i++){
464                         num += pDataArray[i]->count;
465             for (set<string>::iterator it = pDataArray[i]->badSeqNames.begin(); it != pDataArray[i]->badSeqNames.end(); it++) { badSeqNames.insert(*it);       }
466                         CloseHandle(hThreadArray[i]);
467                         delete pDataArray[i];
468                 }
469         
470         for (int i = 0; i < processIDS.size(); i++) {
471             m->appendFiles((goodFileName + toString(processIDS[i]) + ".temp"), goodFileName);
472             m->mothurRemove((goodFileName + toString(processIDS[i]) + ".temp"));
473             
474             m->appendFiles((badFileName + toString(processIDS[i]) + ".temp"), badFileName);
475             m->mothurRemove((badFileName + toString(processIDS[i]) + ".temp"));
476                 }
477         
478 #endif  
479         
480         return num;
481         
482         }
483         catch(exception& e) {
484                 m->errorOut(e, "PcrSeqsCommand", "createProcesses");
485                 exit(1);
486         }
487 }
488
489 //**********************************************************************************************************************
490 int PcrSeqsCommand::driverPcr(string filename, string goodFasta, string badFasta, set<string>& badSeqNames, linePair filePos){
491         try {
492                 ofstream goodFile;
493                 m->openOutputFile(goodFasta, goodFile);
494         
495         ofstream badFile;
496                 m->openOutputFile(badFasta, badFile);
497                 
498                 ifstream inFASTA;
499                 m->openInputFile(filename, inFASTA);
500         
501                 inFASTA.seekg(filePos.start);
502         
503                 bool done = false;
504                 int count = 0;
505         set<int> lengths;
506         
507                 while (!done) {
508             
509                         if (m->control_pressed) {  break; }
510                         
511                         Sequence currSeq(inFASTA); m->gobble(inFASTA);
512             
513             string trashCode = "";
514                         if (currSeq.getName() != "") {
515                 
516                 bool goodSeq = true;
517                 if (oligosfile != "") {
518                     map<int, int> mapAligned;
519                     bool aligned = isAligned(currSeq.getAligned(), mapAligned);
520                     
521                     //process primers
522                     if (primers.size() != 0) {
523                         int primerStart = 0; int primerEnd = 0;
524                         bool good = findForward(currSeq, primerStart, primerEnd);
525                         
526                         if(!good){      if (nomatch == "reject") { goodSeq = false; } trashCode += "f"; }
527                         else{
528                             //are you aligned
529                             if (aligned) { 
530                                 if (!keepprimer)    {  
531                                     if (keepdots)   { currSeq.filterToPos(mapAligned[primerEnd]);   }
532                                     else            { currSeq.setAligned(currSeq.getAligned().substr(mapAligned[primerEnd]));                                              }
533                                 } 
534                                 else                {  
535                                     if (keepdots)   { currSeq.filterToPos(mapAligned[primerStart]);  }
536                                     else            { currSeq.setAligned(currSeq.getAligned().substr(mapAligned[primerStart]));                                              }
537                                 }
538                             }else { 
539                                 if (!keepprimer)    { currSeq.setAligned(currSeq.getUnaligned().substr(primerEnd)); } 
540                                 else                { currSeq.setAligned(currSeq.getUnaligned().substr(primerStart)); } 
541                             }
542                         }
543                     }
544                     
545                     //process reverse primers
546                     if (revPrimer.size() != 0) {
547                         int primerStart = 0; int primerEnd = 0;
548                         bool good = findReverse(currSeq, primerStart, primerEnd);
549                         if(!good){      if (nomatch == "reject") { goodSeq = false; } trashCode += "r"; }
550                         else{ 
551                             //are you aligned
552                             if (aligned) { 
553                                 if (!keepprimer)    {  
554                                     if (keepdots)   { currSeq.filterFromPos(mapAligned[primerStart]); }
555                                     else            { currSeq.setAligned(currSeq.getAligned().substr(0, mapAligned[primerStart]));   }
556                                 } 
557                                 else                {  
558                                     if (keepdots)   { currSeq.filterFromPos(mapAligned[primerEnd]); }
559                                     else            { currSeq.setAligned(currSeq.getAligned().substr(0, mapAligned[primerEnd]));   }
560                                 } 
561                             }
562                             else { 
563                                 if (!keepprimer)    { currSeq.setAligned(currSeq.getUnaligned().substr(0, primerStart));   } 
564                                 else                { currSeq.setAligned(currSeq.getUnaligned().substr(0, primerEnd));     }
565                             }
566                         }
567                     }
568                 }else if (ecolifile != "") {
569                     //make sure the seqs are aligned
570                     lengths.insert(currSeq.getAligned().length());
571                     if (lengths.size() > 1) { m->mothurOut("[ERROR]: seqs are not aligned. When using start and end your sequences must be aligned.\n"); m->control_pressed = true; break; }
572                     else if (currSeq.getAligned().length() != length) {
573                         m->mothurOut("[ERROR]: seqs are not the same length as ecoli seq. When using ecoli option your sequences must be aligned and the same length as the ecoli sequence.\n"); m->control_pressed = true; break; 
574                     }else {
575                         if (keepdots)   { 
576                             currSeq.filterToPos(start); 
577                             currSeq.filterFromPos(end);
578                         }else {
579                             string seqString = currSeq.getAligned().substr(0, end);
580                             seqString = seqString.substr(start);
581                             currSeq.setAligned(seqString); 
582                         }
583                     }
584                 }else{ //using start and end to trim
585                     //make sure the seqs are aligned
586                     lengths.insert(currSeq.getAligned().length());
587                     if (lengths.size() > 1) { m->mothurOut("[ERROR]: seqs are not aligned. When using start and end your sequences must be aligned.\n"); m->control_pressed = true; break; }
588                     else {
589                         if (end != -1) {
590                             if (end > currSeq.getAligned().length()) {  m->mothurOut("[ERROR]: end is longer than your sequence length, aborting.\n"); m->control_pressed = true; break; }
591                             else {
592                                 if (keepdots)   { currSeq.filterFromPos(end); }
593                                 else {
594                                     string seqString = currSeq.getAligned().substr(0, end);
595                                     currSeq.setAligned(seqString); 
596                                 }
597                             }
598                         }
599                         if (start != -1) { 
600                             if (keepdots)   {  currSeq.filterToPos(start);  }
601                             else {
602                                 string seqString = currSeq.getAligned().substr(start);
603                                 currSeq.setAligned(seqString); 
604                             }
605                         }
606                     }
607                 }
608                 
609                 //trimming removed all bases
610                 if (currSeq.getUnaligned() == "") { goodSeq = false; }
611                 
612                                 if(goodSeq == 1)    {   currSeq.printSequence(goodFile);        }
613                                 else {  
614                     badSeqNames.insert(currSeq.getName()); 
615                     currSeq.setName(currSeq.getName() + '|' + trashCode);
616                     currSeq.printSequence(badFile); 
617                 }
618                 count++;
619                         }
620                         
621 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
622             unsigned long long pos = inFASTA.tellg();
623             if ((pos == -1) || (pos >= filePos.end)) { break; }
624 #else
625             if (inFASTA.eof()) { break; }
626 #endif
627                         
628                         //report progress
629                         if((count) % 100 == 0){ m->mothurOut("Processing sequence: " + toString(count)); m->mothurOutEndLine();         }
630                 }
631                 //report progress
632                 if((count) % 100 != 0){ m->mothurOut("Processing sequence: " + toString(count)); m->mothurOutEndLine();         }
633                 
634         badFile.close();
635                 goodFile.close();
636                 inFASTA.close();
637                 
638                 return count;
639         }
640         catch(exception& e) {
641                 m->errorOut(e, "PcrSeqsCommand", "driverPcr");
642                 exit(1);
643         }
644 }
645 //********************************************************************/
646 bool PcrSeqsCommand::findForward(Sequence& seq, int& primerStart, int& primerEnd){
647         try {
648                 
649                 string rawSequence = seq.getUnaligned();
650                 
651                 for(int j=0;j<primers.size();j++){
652                         string oligo = primers[j];
653                         
654                         if(rawSequence.length() < oligo.length()) {  break;  }
655                         
656                         //search for primer
657             int olength = oligo.length();
658             for (int j = 0; j < rawSequence.length()-olength; j++){
659                 if (m->control_pressed) {  primerStart = 0; primerEnd = 0; return false; }
660                 string rawChunk = rawSequence.substr(j, olength);
661                 if(compareDNASeq(oligo, rawChunk)) {
662                     primerStart = j;
663                     primerEnd = primerStart + olength;
664                     return true;
665                 }
666                 
667             }
668         }       
669                 
670         primerStart = 0; primerEnd = 0;
671                 return false;
672                 
673         }
674         catch(exception& e) {
675                 m->errorOut(e, "TrimOligos", "stripForward");
676                 exit(1);
677         }
678 }
679 //******************************************************************/
680 bool PcrSeqsCommand::findReverse(Sequence& seq, int& primerStart, int& primerEnd){
681         try {
682                 
683                 string rawSequence = seq.getUnaligned();
684                 
685                 for(int i=0;i<revPrimer.size();i++){
686                         string oligo = revPrimer[i];
687                         if(rawSequence.length() < oligo.length()) {  break;  }
688                         
689                         //search for primer
690             int olength = oligo.length();
691             for (int j = rawSequence.length()-olength; j >= 0; j--){
692                  if (m->control_pressed) {  primerStart = 0; primerEnd = 0; return false; }
693                 string rawChunk = rawSequence.substr(j, olength);
694             
695                 if(compareDNASeq(oligo, rawChunk)) {
696                     primerStart = j;
697                     primerEnd = primerStart + olength;
698                     return true;
699                 }
700                 
701             }
702                 }       
703                 
704         primerStart = 0; primerEnd = 0;
705                 return false;
706         }
707         catch(exception& e) {
708                 m->errorOut(e, "PcrSeqsCommand", "findReverse");
709                 exit(1);
710         }
711 }
712 //********************************************************************/
713 bool PcrSeqsCommand::isAligned(string seq, map<int, int>& aligned){
714         try {
715         bool isAligned = false;
716         
717         int countBases = 0;
718         for (int i = 0; i < seq.length(); i++) {
719             if (!isalpha(seq[i])) { isAligned = true; }
720             else { aligned[countBases] = i; countBases++; } //maps location in unaligned -> location in aligned.
721         }                                                   //ie. the 3rd base may be at spot 10 in the alignment
722                                                             //later when we trim we want to trim from spot 10.
723         return isAligned;
724     }
725         catch(exception& e) {
726                 m->errorOut(e, "PcrSeqsCommand", "isAligned");
727                 exit(1);
728         }
729 }
730 //********************************************************************/
731 string PcrSeqsCommand::reverseOligo(string oligo){
732         try {
733         string reverse = "";
734        
735         for(int i=oligo.length()-1;i>=0;i--){
736             
737             if(oligo[i] == 'A')         {       reverse += 'T'; }
738             else if(oligo[i] == 'T'){   reverse += 'A'; }
739             else if(oligo[i] == 'U'){   reverse += 'A'; }
740             
741             else if(oligo[i] == 'G'){   reverse += 'C'; }
742             else if(oligo[i] == 'C'){   reverse += 'G'; }
743             
744             else if(oligo[i] == 'R'){   reverse += 'Y'; }
745             else if(oligo[i] == 'Y'){   reverse += 'R'; }
746             
747             else if(oligo[i] == 'M'){   reverse += 'K'; }
748             else if(oligo[i] == 'K'){   reverse += 'M'; }
749             
750             else if(oligo[i] == 'W'){   reverse += 'W'; }
751             else if(oligo[i] == 'S'){   reverse += 'S'; }
752             
753             else if(oligo[i] == 'B'){   reverse += 'V'; }
754             else if(oligo[i] == 'V'){   reverse += 'B'; }
755             
756             else if(oligo[i] == 'D'){   reverse += 'H'; }
757             else if(oligo[i] == 'H'){   reverse += 'D'; }
758
759             else                                                {       reverse += 'N'; }
760         }
761
762         
763         return reverse;
764     }
765         catch(exception& e) {
766                 m->errorOut(e, "PcrSeqsCommand", "reverseOligo");
767                 exit(1);
768         }
769 }
770
771 //***************************************************************************************************************
772 bool PcrSeqsCommand::readOligos(){
773         try {
774                 ifstream inOligos;
775                 m->openInputFile(oligosfile, inOligos);
776                 
777                 string type, oligo, group;
778                 
779                 while(!inOligos.eof()){
780             
781                         inOligos >> type; 
782             
783                         if(type[0] == '#'){ //ignore
784                                 while (!inOligos.eof()) {       char c = inOligos.get();  if (c == 10 || c == 13){      break;  }       } // get rest of line if there's any crap there
785                                 m->gobble(inOligos);
786                         }else{
787                                 m->gobble(inOligos);
788                                 //make type case insensitive
789                                 for(int i=0;i<type.length();i++){       type[i] = toupper(type[i]);  }
790                                 
791                                 inOligos >> oligo;
792                                 
793                                 for(int i=0;i<oligo.length();i++){
794                                         oligo[i] = toupper(oligo[i]);
795                                         if(oligo[i] == 'U')     {       oligo[i] = 'T'; }
796                                 }
797                                 
798                                 if(type == "FORWARD"){
799                                         // get rest of line in case there is a primer name
800                                         while (!inOligos.eof()) {       
801                         char c = inOligos.get(); 
802                         if (c == 10 || c == 13){        break;  } 
803                         else if (c == 32 || c == 9){;} //space or tab
804                                         } 
805                                         primers.push_back(oligo);
806                 }else if(type == "REVERSE"){
807                     string oligoRC = reverseOligo(oligo);
808                     revPrimer.push_back(oligoRC);
809                     //cout << "oligo = " << oligo << " reverse = " << oligoRC << endl;
810                                 }else if(type == "BARCODE"){
811                                         inOligos >> group;
812                                 }else if((type == "LINKER")||(type == "SPACER")) {;}
813                                 else{   m->mothurOut(type + " is not recognized as a valid type. Choices are forward, reverse, linker, spacer and barcode. Ignoring " + oligo + "."); m->mothurOutEndLine(); m->control_pressed = true; }
814                         }
815                         m->gobble(inOligos);
816                 }       
817                 inOligos.close();
818                 
819                 if ((primers.size() == 0) && (revPrimer.size() == 0)) {
820                         m->mothurOut("[ERROR]: your oligos file does not contain valid primers or reverse primers.  Please correct."); m->mothurOutEndLine();
821             m->control_pressed = true;
822                         return false;
823                 }
824         
825         return true;
826         
827     }catch(exception& e) {
828                 m->errorOut(e, "PcrSeqsCommand", "readOligos");
829                 exit(1);
830         }
831 }
832 //***************************************************************************************************************
833 bool PcrSeqsCommand::readEcoli(){
834         try {
835                 ifstream in;
836                 m->openInputFile(ecolifile, in);
837                 
838         //read seq
839         if (!in.eof()){ 
840             Sequence ecoli(in); 
841             length = ecoli.getAligned().length();
842             start = ecoli.getStartPos();
843             end = ecoli.getEndPos();
844         }else { in.close(); m->control_pressed = true; return false; }
845         in.close();    
846                         
847         return true;
848     }
849         catch(exception& e) {
850                 m->errorOut(e, "PcrSeqsCommand", "readEcoli");
851                 exit(1);
852         }
853     
854 }
855 //***************************************************************************************************************
856 int PcrSeqsCommand::writeAccnos(set<string> badNames){
857         try {
858                 string thisOutputDir = outputDir;
859                 if (outputDir == "") {  thisOutputDir += m->hasPath(fastafile);  }
860                 string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(fastafile)) + "bad.accnos";
861         outputNames.push_back(outputFileName); outputTypes["accnos"].push_back(outputFileName);
862         
863         ofstream out;
864         m->openOutputFile(outputFileName, out);
865         
866         for (set<string>::iterator it = badNames.begin(); it != badNames.end(); it++) {
867             if (m->control_pressed) { break; }
868             out << (*it) << endl;
869         }
870         
871         out.close();
872         return 0;
873     }
874         catch(exception& e) {
875                 m->errorOut(e, "PcrSeqsCommand", "writeAccnos");
876                 exit(1);
877         }
878     
879 }
880 //******************************************************************/
881 bool PcrSeqsCommand::compareDNASeq(string oligo, string seq){
882         try {
883                 bool success = 1;
884                 int length = oligo.length();
885                 
886                 for(int i=0;i<length;i++){
887                         
888                         if(oligo[i] != seq[i]){
889                                 if(oligo[i] == 'A' || oligo[i] == 'T' || oligo[i] == 'G' || oligo[i] == 'C')    {       success = 0;    }
890                                 else if((oligo[i] == 'N' || oligo[i] == 'I') && (seq[i] == 'N'))                                {       success = 0;    }
891                                 else if(oligo[i] == 'R' && (seq[i] != 'A' && seq[i] != 'G'))                                    {       success = 0;    }
892                                 else if(oligo[i] == 'Y' && (seq[i] != 'C' && seq[i] != 'T'))                                    {       success = 0;    }
893                                 else if(oligo[i] == 'M' && (seq[i] != 'C' && seq[i] != 'A'))                                    {       success = 0;    }
894                                 else if(oligo[i] == 'K' && (seq[i] != 'T' && seq[i] != 'G'))                                    {       success = 0;    }
895                                 else if(oligo[i] == 'W' && (seq[i] != 'T' && seq[i] != 'A'))                                    {       success = 0;    }
896                                 else if(oligo[i] == 'S' && (seq[i] != 'C' && seq[i] != 'G'))                                    {       success = 0;    }
897                                 else if(oligo[i] == 'B' && (seq[i] != 'C' && seq[i] != 'T' && seq[i] != 'G'))   {       success = 0;    }
898                                 else if(oligo[i] == 'D' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'G'))   {       success = 0;    }
899                                 else if(oligo[i] == 'H' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'C'))   {       success = 0;    }
900                                 else if(oligo[i] == 'V' && (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G'))   {       success = 0;    }                       
901                                 
902                                 if(success == 0)        {       break;   }
903                         }
904                         else{
905                                 success = 1;
906                         }
907                 }
908                 
909                 return success;
910         }
911         catch(exception& e) {
912                 m->errorOut(e, "PcrSeqsCommand", "compareDNASeq");
913                 exit(1);
914         }
915         
916 }
917 //***************************************************************************************************************
918 int PcrSeqsCommand::readName(set<string>& names){
919         try {
920                 string thisOutputDir = outputDir;
921                 if (outputDir == "") {  thisOutputDir += m->hasPath(namefile);  }
922                 string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(namefile)) + "pcr" + m->getExtension(namefile);
923         
924                 ofstream out;
925                 m->openOutputFile(outputFileName, out);
926         
927                 ifstream in;
928                 m->openInputFile(namefile, in);
929                 string name, firstCol, secondCol;
930                 
931                 bool wroteSomething = false;
932                 int removedCount = 0;
933                 
934                 while(!in.eof()){
935                         if (m->control_pressed) { in.close();  out.close();  m->mothurRemove(outputFileName);  return 0; }
936                         
937                         in >> firstCol;         m->gobble(in);          
938                         in >> secondCol;                        
939                         
940             string savedSecond = secondCol;
941                         vector<string> parsedNames;
942                         m->splitAtComma(secondCol, parsedNames);
943                         
944                         vector<string> validSecond;  validSecond.clear();
945                         for (int i = 0; i < parsedNames.size(); i++) {
946                                 if (names.count(parsedNames[i]) == 0) {
947                                         validSecond.push_back(parsedNames[i]);
948                                 }
949                         }
950                         
951                         if (validSecond.size() != parsedNames.size()) {  //we want to get rid of someone, so get rid of everyone
952                                 for (int i = 0; i < parsedNames.size(); i++) {  names.insert(parsedNames[i]);  }
953                                 removedCount += parsedNames.size();
954                         }else {
955                 out << firstCol << '\t' << savedSecond << endl;
956                 wroteSomething = true;
957             }
958                         m->gobble(in);
959                 }
960                 in.close();
961                 out.close();
962                 
963                 if (wroteSomething == false) {  m->mothurOut("Your file contains only sequences from the .accnos file."); m->mothurOutEndLine();  }
964                 outputTypes["name"].push_back(outputFileName); outputNames.push_back(outputFileName);
965                 
966                 m->mothurOut("Removed " + toString(removedCount) + " sequences from your name file."); m->mothurOutEndLine();
967                 
968                 return 0;
969         }
970         catch(exception& e) {
971                 m->errorOut(e, "PcrSeqsCommand", "readName");
972                 exit(1);
973         }
974 }
975 //**********************************************************************************************************************
976 int PcrSeqsCommand::readGroup(set<string> names){
977         try {
978                 string thisOutputDir = outputDir;
979                 if (outputDir == "") {  thisOutputDir += m->hasPath(groupfile);  }
980                 string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(groupfile)) + "pcr" + m->getExtension(groupfile);
981                 
982                 ofstream out;
983                 m->openOutputFile(outputFileName, out);
984         
985                 ifstream in;
986                 m->openInputFile(groupfile, in);
987                 string name, group;
988                 
989                 bool wroteSomething = false;
990                 int removedCount = 0;
991                 
992                 while(!in.eof()){
993                         if (m->control_pressed) { in.close();  out.close();  m->mothurRemove(outputFileName);  return 0; }
994                         
995                         in >> name;                             //read from first column
996                         in >> group;                    //read from second column
997                         
998                         //if this name is in the accnos file
999                         if (names.count(name) == 0) {
1000                                 wroteSomething = true;
1001                                 out << name << '\t' << group << endl;
1002                         }else {  removedCount++;  }
1003             
1004                         m->gobble(in);
1005                 }
1006                 in.close();
1007                 out.close();
1008                 
1009                 if (wroteSomething == false) {  m->mothurOut("Your file contains only sequences from the .accnos file."); m->mothurOutEndLine();  }
1010                 outputTypes["group"].push_back(outputFileName); outputNames.push_back(outputFileName);
1011                 
1012                 m->mothurOut("Removed " + toString(removedCount) + " sequences from your group file."); m->mothurOutEndLine();
1013         
1014                 
1015                 return 0;
1016         }
1017         catch(exception& e) {
1018                 m->errorOut(e, "PcrSeqsCommand", "readGroup");
1019                 exit(1);
1020         }
1021 }
1022 //**********************************************************************************************************************
1023 int PcrSeqsCommand::readTax(set<string> names){
1024         try {
1025                 string thisOutputDir = outputDir;
1026                 if (outputDir == "") {  thisOutputDir += m->hasPath(taxfile);  }
1027                 string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(taxfile)) + "pcr" + m->getExtension(taxfile);
1028                 ofstream out;
1029                 m->openOutputFile(outputFileName, out);
1030         
1031                 ifstream in;
1032                 m->openInputFile(taxfile, in);
1033                 string name, tax;
1034                 
1035                 bool wroteSomething = false;
1036                 int removedCount = 0;
1037                 
1038                 while(!in.eof()){
1039                         if (m->control_pressed) { in.close();  out.close();  m->mothurRemove(outputFileName);  return 0; }
1040                         
1041                         in >> name;                             //read from first column
1042                         in >> tax;                      //read from second column
1043                         
1044                         //if this name is in the accnos file
1045                         if (names.count(name) == 0) {
1046                                 wroteSomething = true;
1047                                 out << name << '\t' << tax << endl;
1048                         }else {  removedCount++;  }
1049             
1050                         m->gobble(in);
1051                 }
1052                 in.close();
1053                 out.close();
1054                 
1055                 if (wroteSomething == false) {  m->mothurOut("Your file contains only sequences from the .accnos file."); m->mothurOutEndLine();  }
1056                 outputTypes["taxonomy"].push_back(outputFileName); outputNames.push_back(outputFileName);
1057                 
1058                 m->mothurOut("Removed " + toString(removedCount) + " sequences from your taxonomy file."); m->mothurOutEndLine();
1059                 
1060                 return 0;
1061         }
1062         catch(exception& e) {
1063                 m->errorOut(e, "PcrSeqsCommand", "readTax");
1064                 exit(1);
1065         }
1066 }
1067 /**************************************************************************************/
1068
1069