]> git.donarmstrong.com Git - mothur.git/blob - prcseqscommand.cpp
minor changes to pcr.seqs. Added filterToPos and filterFromPos to sequence class...
[mothur.git] / prcseqscommand.cpp
1 //
2 //  prcseqscommand.cpp
3 //  Mothur
4 //
5 //  Created by Sarah Westcott on 3/14/12.
6 //  Copyright (c) 2012 Schloss Lab. All rights reserved.
7 //
8
9 #include "pcrseqscommand.h"
10
11 //**********************************************************************************************************************
12 vector<string> PcrSeqsCommand::setParameters(){ 
13         try {
14                 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta);
15                 CommandParameter poligos("oligos", "InputTypes", "", "", "ecolioligos", "none", "none",false,false); parameters.push_back(poligos);
16                 CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname);
17         CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pgroup);
18         CommandParameter ptax("taxonomy", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(ptax);
19         CommandParameter pecoli("ecoli", "InputTypes", "", "", "ecolioligos", "none", "none",false,false); parameters.push_back(pecoli);
20                 CommandParameter pstart("start", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pstart);
21                 CommandParameter pend("end", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pend);
22                 CommandParameter pnomatch("nomatch", "Multiple", "reject-keep", "reject", "", "", "",false,false); parameters.push_back(pnomatch);
23                 CommandParameter ppdiffs("pdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(ppdiffs);
24                 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
25                 CommandParameter pkeepprimer("keepprimer", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pkeepprimer);
26         CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
27                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
28         
29                 vector<string> myArray;
30                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
31                 return myArray;
32         }
33         catch(exception& e) {
34                 m->errorOut(e, "PcrSeqsCommand", "setParameters");
35                 exit(1);
36         }
37 }
38 //**********************************************************************************************************************
39 string PcrSeqsCommand::getHelpString(){ 
40         try {
41                 string helpString = "";
42                 helpString += "The pcr.seqs command reads a fasta file ...\n";
43                 
44                 helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n";
45                 helpString += "For more details please check out the wiki http://www.mothur.org/wiki/Pcr.seqs .\n";
46                 return helpString;
47         }
48         catch(exception& e) {
49                 m->errorOut(e, "PcrSeqsCommand", "getHelpString");
50                 exit(1);
51         }
52 }
53
54
55 //**********************************************************************************************************************
56
57 PcrSeqsCommand::PcrSeqsCommand(){       
58         try {
59                 abort = true; calledHelp = true; 
60                 setParameters();
61                 vector<string> tempOutNames;
62                 outputTypes["fasta"] = tempOutNames;
63                 outputTypes["taxonomy"] = tempOutNames;
64                 outputTypes["group"] = tempOutNames;
65                 outputTypes["name"] = tempOutNames;
66         outputTypes["accnos"] = tempOutNames;
67         }
68         catch(exception& e) {
69                 m->errorOut(e, "PcrSeqsCommand", "PcrSeqsCommand");
70                 exit(1);
71         }
72 }
73 //***************************************************************************************************************
74
75 PcrSeqsCommand::PcrSeqsCommand(string option)  {
76         try {
77                 
78                 abort = false; calledHelp = false;   
79                 
80                 //allow user to run help
81                 if(option == "help") { help(); abort = true; calledHelp = true; }
82                 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
83                 
84                 else {
85                         vector<string> myArray = setParameters();
86                         
87                         OptionParser parser(option);
88                         map<string,string> parameters = parser.getParameters();
89                         
90                         ValidParameters validParameter;
91                         map<string,string>::iterator it;
92                         
93                         //check to make sure all parameters are valid for command
94                         for (it = parameters.begin(); it != parameters.end(); it++) { 
95                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
96                         }
97                         
98                         //initialize outputTypes
99                         vector<string> tempOutNames;
100                         outputTypes["fasta"] = tempOutNames;
101                         outputTypes["taxonomy"] = tempOutNames;
102                         outputTypes["group"] = tempOutNames;
103                         outputTypes["name"] = tempOutNames;
104             outputTypes["accnos"] = tempOutNames;
105                         
106                         //if the user changes the input directory command factory will send this info to us in the output parameter 
107                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
108                         if (inputDir == "not found"){   inputDir = "";          }
109                         else {
110                                 string path;
111                                 it = parameters.find("fasta");
112                                 //user has given a template file
113                                 if(it != parameters.end()){ 
114                                         path = m->hasPath(it->second);
115                                         //if the user has not given a path then, add inputdir. else leave path alone.
116                                         if (path == "") {       parameters["fasta"] = inputDir + it->second;            }
117                                 }
118                                 
119                                 it = parameters.find("oligos");
120                                 //user has given a template file
121                                 if(it != parameters.end()){ 
122                                         path = m->hasPath(it->second);
123                                         //if the user has not given a path then, add inputdir. else leave path alone.
124                                         if (path == "") {       parameters["oligos"] = inputDir + it->second;           }
125                                 }
126                 
127                 it = parameters.find("ecoli");
128                                 //user has given a template file
129                                 if(it != parameters.end()){ 
130                                         path = m->hasPath(it->second);
131                                         //if the user has not given a path then, add inputdir. else leave path alone.
132                                         if (path == "") {       parameters["ecoli"] = inputDir + it->second;            }
133                                 }
134                                 
135                                 it = parameters.find("taxonomy");
136                                 //user has given a template file
137                                 if(it != parameters.end()){ 
138                                         path = m->hasPath(it->second);
139                                         //if the user has not given a path then, add inputdir. else leave path alone.
140                                         if (path == "") {       parameters["taxonomy"] = inputDir + it->second;         }
141                                 }
142                                 
143                                 it = parameters.find("name");
144                                 //user has given a template file
145                                 if(it != parameters.end()){ 
146                                         path = m->hasPath(it->second);
147                                         //if the user has not given a path then, add inputdir. else leave path alone.
148                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
149                                 }
150                 
151                 it = parameters.find("group");
152                                 //user has given a template file
153                                 if(it != parameters.end()){ 
154                                         path = m->hasPath(it->second);
155                                         //if the user has not given a path then, add inputdir. else leave path alone.
156                                         if (path == "") {       parameters["group"] = inputDir + it->second;            }
157                                 }
158                                 
159                         }
160             
161             //if the user changes the output directory command factory will send this info to us in the output parameter 
162                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = ""; }
163                         
164                         //check for required parameters
165                         fastafile = validParameter.validFile(parameters, "fasta", true);
166                         if (fastafile == "not found") {                                 
167                                 fastafile = m->getFastaFile(); 
168                                 if (fastafile != "") { m->mothurOut("Using " + fastafile + " as input file for the fasta parameter."); m->mothurOutEndLine(); }
169                                 else {  m->mothurOut("You have no current fastafile and the fasta parameter is required."); m->mothurOutEndLine(); abort = true; }
170                         }else if (fastafile == "not open") { fastafile = ""; abort = true; }    
171                         else { m->setFastaFile(fastafile); }
172                         
173             
174                         //check for optional parameter and set defaults
175                         // ...at some point should added some additional type checking...
176                         string temp;
177                         temp = validParameter.validFile(parameters, "keepprimer", false);  if (temp == "not found")    {        temp = "f";     }
178                         keepprimer = m->isTrue(temp);   
179             
180                         temp = validParameter.validFile(parameters, "oligos", true);
181                         if (temp == "not found"){       oligosfile = "";                }
182                         else if(temp == "not open"){    oligosfile = ""; abort = true;  } 
183                         else                                    {       oligosfile = temp; m->setOligosFile(oligosfile);                }
184                         
185             ecolifile = validParameter.validFile(parameters, "ecoli", true);
186                         if (ecolifile == "not found"){  ecolifile = "";         }
187                         else if(ecolifile == "not open"){       ecolifile = ""; abort = true;   } 
188                         
189             namefile = validParameter.validFile(parameters, "name", true);
190                         if (namefile == "not found"){   namefile = "";          }
191                         else if(namefile == "not open"){        namefile = ""; abort = true;    } 
192             else { m->setNameFile(namefile); }
193             
194             groupfile = validParameter.validFile(parameters, "group", true);
195                         if (groupfile == "not found"){  groupfile = "";         }
196                         else if(groupfile == "not open"){       groupfile = ""; abort = true;   } 
197             else { m->setGroupFile(groupfile); }
198             
199             taxfile = validParameter.validFile(parameters, "taxonomy", true);
200                         if (taxfile == "not found"){    taxfile = "";           }
201                         else if(taxfile == "not open"){ taxfile = ""; abort = true;     } 
202             else { m->setTaxonomyFile(taxfile); }
203                         
204                         temp = validParameter.validFile(parameters, "pdiffs", false);           if (temp == "not found") { temp = "0"; }
205                         m->mothurConvert(temp, pdiffs);
206                         
207                         temp = validParameter.validFile(parameters, "start", false);    if (temp == "not found") { temp = "-1"; }
208                         m->mothurConvert(temp, start);
209             
210             temp = validParameter.validFile(parameters, "end", false);  if (temp == "not found") { temp = "-1"; }
211                         m->mothurConvert(temp, end);
212                         
213                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found"){       temp = m->getProcessors();      }
214                         m->setProcessors(temp);
215                         m->mothurConvert(temp, processors); 
216                         
217             nomatch = validParameter.validFile(parameters, "nomatch", false);   if (nomatch == "not found") { nomatch = "reject"; }
218                         
219             if ((nomatch != "reject") && (nomatch != "keep")) { m->mothurOut("[ERROR]: " + nomatch + " is not a valid entry for nomatch. Choices are reject and keep.\n");  abort = true; }
220             
221             //didnt set anything
222                         if ((oligosfile == "") && (ecolifile == "") && (start == -1) && (end == -1)) {
223                 m->mothurOut("[ERROR]: You did not set any options. Please provide an oligos or ecoli file, or set start or end.\n"); abort = true;
224             }
225             
226             if ((oligosfile == "") && (ecolifile == "") && (start < 0) && (end == -1)) { m->mothurOut("[ERROR]: Invalid start value.\n"); abort = true; }
227             
228             if ((ecolifile != "") && (start != -1) && (end != -1)) {
229                 m->mothurOut("[ERROR]: You provided an ecoli file , but set the start or end parameters. Unsure what you intend.  When you provide the ecoli file, mothur thinks you want to use the start and end of the sequence in the ecoli file.\n"); abort = true;
230             }
231
232             
233             if ((oligosfile != "") && (ecolifile != "")) {
234                  m->mothurOut("[ERROR]: You can not use an ecoli file at the same time as an oligos file.\n"); abort = true;
235             }
236                         
237                         //check to make sure you didn't forget the name file by mistake                 
238                         if (namefile == "") {
239                                 vector<string> files; files.push_back(fastafile);
240                                 parser.getNameFile(files);
241                         }
242                 }
243         
244         }
245         catch(exception& e) {
246                 m->errorOut(e, "PcrSeqsCommand", "PcrSeqsCommand");
247                 exit(1);
248         }
249 }
250 //***************************************************************************************************************
251
252 int PcrSeqsCommand::execute(){
253         try{
254         
255                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
256                 
257         int start = time(NULL);
258         
259         string thisOutputDir = outputDir;
260                 if (outputDir == "") {  thisOutputDir += m->hasPath(fastafile);  }
261                 string trimSeqFile = thisOutputDir + m->getRootName(m->getSimpleName(fastafile)) + "pcr.fasta";
262                 outputNames.push_back(trimSeqFile); outputTypes["fasta"].push_back(trimSeqFile);
263         
264         string badSeqFile = thisOutputDir + m->getRootName(m->getSimpleName(fastafile)) + "pcr.scrap.fasta";
265                 outputNames.push_back(badSeqFile); outputTypes["fasta"].push_back(badSeqFile);
266                 
267         length = 0;
268                 if(oligosfile != ""){    readOligos();     }  if (m->control_pressed) {  return 0; }
269         if(ecolifile != "") {    readEcoli();      }  if (m->control_pressed) {  return 0; }
270         
271         vector<unsigned long long> positions; 
272         int numFastaSeqs = 0;
273 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
274         positions = m->divideFile(fastafile, processors);
275         for (int i = 0; i < (positions.size()-1); i++) {        lines.push_back(linePair(positions[i], positions[(i+1)]));      }
276 #else
277         if (processors == 1) {
278             lines.push_back(linePair(0, 1000));
279         }else {
280             positions = m->setFilePosFasta(fastafile, numFastaSeqs); 
281             if (positions.size() < processors) { processors = positions.size(); }
282             
283             //figure out how many sequences you have to process
284             int numSeqsPerProcessor = numFastaSeqs / processors;
285             for (int i = 0; i < processors; i++) {
286                 int startIndex =  i * numSeqsPerProcessor;
287                 if(i == (processors - 1)){      numSeqsPerProcessor = numFastaSeqs - i * numSeqsPerProcessor;   }
288                 lines.push_back(linePair(positions[startIndex], numSeqsPerProcessor));
289             }
290         }
291 #endif
292         if (m->control_pressed) {  return 0; }
293
294         set<string> badNames;
295         if(processors == 1) {    numFastaSeqs = driverPcr(fastafile, trimSeqFile, badSeqFile, badNames, lines[0]);   }
296         else                {    numFastaSeqs = createProcesses(fastafile, trimSeqFile, badSeqFile, badNames);       }  
297                 
298                 if (m->control_pressed) {  return 0; }          
299         
300         writeAccnos(badNames);      
301         if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) {        m->mothurRemove(outputNames[i]); } return 0; }
302         if (namefile != "")                     {               readName(badNames);             }   
303         if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) {        m->mothurRemove(outputNames[i]); } return 0; }
304         if (groupfile != "")            {               readGroup(badNames);    }
305         if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) {        m->mothurRemove(outputNames[i]); } return 0; }
306                 if (taxfile != "")                      {               readTax(badNames);              }
307                 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) {        m->mothurRemove(outputNames[i]); } return 0; }
308         
309         m->mothurOutEndLine();
310                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
311                 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
312                 m->mothurOutEndLine();
313                 m->mothurOutEndLine();
314                 
315                 //set fasta file as new current fastafile
316                 string current = "";
317                 itTypes = outputTypes.find("fasta");
318                 if (itTypes != outputTypes.end()) {
319                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFastaFile(current); }
320                 }
321                 
322                 itTypes = outputTypes.find("name");
323                 if (itTypes != outputTypes.end()) {
324                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setNameFile(current); }
325                 }
326                 
327                 itTypes = outputTypes.find("group");
328                 if (itTypes != outputTypes.end()) {
329                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setGroupFile(current); }
330                 }
331                 
332                 itTypes = outputTypes.find("accnos");
333                 if (itTypes != outputTypes.end()) {
334                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setAccnosFile(current); }
335                 }
336                 
337                 itTypes = outputTypes.find("taxonomy");
338                 if (itTypes != outputTypes.end()) {
339                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setTaxonomyFile(current); }
340                 }
341         
342                 m->mothurOut("It took " + toString(time(NULL) - start) + " secs to screen " + toString(numFastaSeqs) + " sequences.");
343                 m->mothurOutEndLine();
344
345                 
346                 return 0;       
347         
348         }
349         catch(exception& e) {
350                 m->errorOut(e, "PcrSeqsCommand", "execute");
351                 exit(1);
352         }
353 }
354 /**************************************************************************************************/
355 int PcrSeqsCommand::createProcesses(string filename, string goodFileName, string badFileName, set<string>& badSeqNames) {
356         try {
357         
358         vector<int> processIDS;   
359         int process = 1;
360                 int num = 0;
361         
362 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
363         
364                 //loop through and create all the processes you want
365                 while (process != processors) {
366                         int pid = fork();
367                         
368                         if (pid > 0) {
369                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
370                                 process++;
371                         }else if (pid == 0){
372                                 num = driverPcr(filename, goodFileName + toString(getpid()) + ".temp", badFileName + toString(getpid()) + ".temp", badSeqNames, lines[process]);
373                                 
374                                 //pass numSeqs to parent
375                                 ofstream out;
376                                 string tempFile = filename + toString(getpid()) + ".num.temp";
377                                 m->openOutputFile(tempFile, out);
378                                 out << num << '\t' << badSeqNames.size() << endl;
379                 for (set<string>::iterator it = badSeqNames.begin(); it != badSeqNames.end(); it++) {
380                     out << (*it) << endl;
381                 }
382                                 out.close();
383                                 
384                                 exit(0);
385                         }else { 
386                                 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine(); 
387                                 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
388                                 exit(0);
389                         }
390                 }
391                 
392         num = driverPcr(filename, goodFileName, badFileName, badSeqNames, lines[0]);
393         
394                 //force parent to wait until all the processes are done
395                 for (int i=0;i<processIDS.size();i++) { 
396                         int temp = processIDS[i];
397                         wait(&temp);
398                 }
399                 
400                 for (int i = 0; i < processIDS.size(); i++) {
401                         ifstream in;
402                         string tempFile =  filename + toString(processIDS[i]) + ".num.temp";
403                         m->openInputFile(tempFile, in);
404             int numBadNames = 0; string name = "";
405                         if (!in.eof()) { int tempNum = 0; in >> tempNum >> numBadNames; num += tempNum; m->gobble(in); }
406             for (int j = 0; j < numBadNames; j++) {
407                 in >> name; m->gobble(in);
408                 badSeqNames.insert(name);
409             }
410                         in.close(); m->mothurRemove(tempFile);
411             
412             m->appendFiles((goodFileName + toString(processIDS[i]) + ".temp"), goodFileName);
413             m->mothurRemove((goodFileName + toString(processIDS[i]) + ".temp"));
414             
415             m->appendFiles((badFileName + toString(processIDS[i]) + ".temp"), badFileName);
416             m->mothurRemove((badFileName + toString(processIDS[i]) + ".temp"));
417                 }
418     #else
419         
420         //////////////////////////////////////////////////////////////////////////////////////////////////////
421                 //Windows version shared memory, so be careful when passing variables through the sumScreenData struct. 
422                 //Above fork() will clone, so memory is separate, but that's not the case with windows, 
423                 //Taking advantage of shared memory to allow both threads to add info to badSeqNames.
424                 //////////////////////////////////////////////////////////////////////////////////////////////////////
425                 
426                 vector<pcrData*> pDataArray; 
427                 DWORD   dwThreadIdArray[processors-1];
428                 HANDLE  hThreadArray[processors-1]; 
429                 
430                 //Create processor worker threads.
431                 for( int i=0; i<processors-1; i++ ){
432             
433             string extension = "";
434             if (i!=0) {extension += toString(i) + ".temp"; processIDS.push_back(i); }
435             
436                         // Allocate memory for thread data.
437                         pcrData* tempPcr = new pcrData(filename, goodFileName+extension, badFileName+extension, m, oligosfile, ecolifile, primers, revPrimer, nomatch, keepprimer, start, end, length, lines[i].start, lines[i].end);
438                         pDataArray.push_back(tempPcr);
439                         
440                         //default security attributes, thread function name, argument to thread function, use default creation flags, returns the thread identifier
441                         hThreadArray[i] = CreateThread(NULL, 0, MyPcrThreadFunction, pDataArray[i], 0, &dwThreadIdArray[i]);   
442                 }
443                 
444         //do your part
445         num = driverPcr(filename, (goodFileName+toString(processors-1)+".temp"), (badFileName+toString(processors-1)+".temp"),badSeqNames, lines[processors-1]);
446         processIDS.push_back(processors-1);
447         
448                 //Wait until all threads have terminated.
449                 WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
450                 
451                 //Close all thread handles and free memory allocations.
452                 for(int i=0; i < pDataArray.size(); i++){
453                         num += pDataArray[i]->count;
454             for (set<string>::iterator it = pDataArray[i]->badSeqNames.begin(); it != pDataArray[i]->badSeqNames.end(); it++) { badSeqNames.insert(*it);       }
455                         CloseHandle(hThreadArray[i]);
456                         delete pDataArray[i];
457                 }
458         
459         for (int i = 0; i < processIDS.size(); i++) {
460             m->appendFiles((goodFileName + toString(processIDS[i]) + ".temp"), goodFileName);
461             m->mothurRemove((goodFileName + toString(processIDS[i]) + ".temp"));
462             
463             m->appendFiles((badFileName + toString(processIDS[i]) + ".temp"), badFileName);
464             m->mothurRemove((badFileName + toString(processIDS[i]) + ".temp"));
465                 }
466         
467 #endif  
468         
469         return num;
470         
471         }
472         catch(exception& e) {
473                 m->errorOut(e, "PcrSeqsCommand", "createProcesses");
474                 exit(1);
475         }
476 }
477
478 //**********************************************************************************************************************
479 int PcrSeqsCommand::driverPcr(string filename, string goodFasta, string badFasta, set<string>& badSeqNames, linePair filePos){
480         try {
481                 ofstream goodFile;
482                 m->openOutputFile(goodFasta, goodFile);
483         
484         ofstream badFile;
485                 m->openOutputFile(badFasta, badFile);
486                 
487                 ifstream inFASTA;
488                 m->openInputFile(filename, inFASTA);
489         
490                 inFASTA.seekg(filePos.start);
491         
492                 bool done = false;
493                 int count = 0;
494         set<int> lengths;
495         
496                 while (!done) {
497             
498                         if (m->control_pressed) {  break; }
499                         
500                         Sequence currSeq(inFASTA); m->gobble(inFASTA);
501             
502             string trashCode = "";
503                         if (currSeq.getName() != "") {
504                 
505                 bool goodSeq = true;
506                 if (oligosfile != "") {
507                     map<int, int> mapAligned;
508                     bool aligned = isAligned(currSeq.getAligned(), mapAligned);
509                     
510                     //process primers
511                     if (primers.size() != 0) {
512                         int primerStart = 0; int primerEnd = 0;
513                         bool good = findForward(currSeq, primerStart, primerEnd);
514                         
515                         if(!good){      if (nomatch == "reject") { goodSeq = false; } trashCode += "f"; }
516                         else{
517                             //are you aligned
518                             if (aligned) { 
519                                 if (!keepprimer)    {  currSeq.filterToPos(mapAligned[primerEnd]); } 
520                                 else                {  currSeq.filterToPos(mapAligned[primerStart]); }
521                             }else { 
522                                 if (!keepprimer)    { currSeq.setAligned(currSeq.getUnaligned().substr(primerEnd)); } 
523                                 else                { currSeq.setAligned(currSeq.getUnaligned().substr(primerStart)); } 
524                             }
525                         }
526                     }
527                     
528                     //process reverse primers
529                     if (revPrimer.size() != 0) {
530                         int primerStart = 0; int primerEnd = 0;
531                         bool good = findReverse(currSeq, primerStart, primerEnd);
532                         if(!good){      if (nomatch == "reject") { goodSeq = false; } trashCode += "r"; }
533                         else{ 
534                             //are you aligned
535                             if (aligned) { 
536                                 if (!keepprimer)    {  currSeq.filterFromPos(mapAligned[primerStart]); } 
537                                 else                {  currSeq.filterFromPos(mapAligned[primerEnd]); } 
538                             }
539                             else { 
540                                 if (!keepprimer)    { currSeq.setAligned(currSeq.getUnaligned().substr(0, primerStart));   } 
541                                 else                { currSeq.setAligned(currSeq.getUnaligned().substr(0, primerEnd));     }
542                             }
543                         }
544                     }
545                 }else if (ecolifile != "") {
546                     //make sure the seqs are aligned
547                     lengths.insert(currSeq.getAligned().length());
548                     if (lengths.size() > 1) { m->mothurOut("[ERROR]: seqs are not aligned. When using start and end your sequences must be aligned.\n"); m->control_pressed = true; break; }
549                     else if (currSeq.getAligned().length() != length) {
550                         m->mothurOut("[ERROR]: seqs are not the same length as ecoli seq. When using ecoli option your sequences must be aligned and the same length as the ecoli sequence.\n"); m->control_pressed = true; break; 
551                     }else {
552                         currSeq.filterToPos(start); 
553                         currSeq.filterFromPos(end);
554                     }
555                 }else{ //using start and end to trim
556                     //make sure the seqs are aligned
557                     lengths.insert(currSeq.getAligned().length());
558                     if (lengths.size() > 1) { m->mothurOut("[ERROR]: seqs are not aligned. When using start and end your sequences must be aligned.\n"); m->control_pressed = true; break; }
559                     else {
560                         if (start != -1) { currSeq.filterToPos(start); }
561                         if (end != -1) {
562                             if (end > currSeq.getAligned().length()) {  m->mothurOut("[ERROR]: end is longer than your sequence length, aborting.\n"); m->control_pressed = true; break; }
563                             else {
564                                 currSeq.filterFromPos(end);
565                             }
566                         }
567                     }
568                 }
569                                     
570                                 if(goodSeq == 1)    {   currSeq.printSequence(goodFile);        }
571                                 else {  
572                     badSeqNames.insert(currSeq.getName()); 
573                     currSeq.setName(currSeq.getName() + '|' + trashCode);
574                     currSeq.printSequence(badFile); 
575                 }
576                 count++;
577                         }
578                         
579 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
580             unsigned long long pos = inFASTA.tellg();
581             if ((pos == -1) || (pos >= filePos.end)) { break; }
582 #else
583             if (inFASTA.eof()) { break; }
584 #endif
585                         
586                         //report progress
587                         if((count) % 100 == 0){ m->mothurOut("Processing sequence: " + toString(count)); m->mothurOutEndLine();         }
588                 }
589                 //report progress
590                 if((count) % 100 != 0){ m->mothurOut("Processing sequence: " + toString(count)); m->mothurOutEndLine();         }
591                 
592         badFile.close();
593                 goodFile.close();
594                 inFASTA.close();
595                 
596                 return count;
597         }
598         catch(exception& e) {
599                 m->errorOut(e, "PcrSeqsCommand", "driverPcr");
600                 exit(1);
601         }
602 }
603 //********************************************************************/
604 bool PcrSeqsCommand::findForward(Sequence& seq, int& primerStart, int& primerEnd){
605         try {
606                 
607                 string rawSequence = seq.getUnaligned();
608                 
609                 for(int j=0;j<primers.size();j++){
610                         string oligo = primers[j];
611                         
612                         if(rawSequence.length() < oligo.length()) {  break;  }
613                         
614                         //search for primer
615             int olength = oligo.length();
616             for (int j = 0; j < rawSequence.length()-olength; j++){
617                 if (m->control_pressed) {  primerStart = 0; primerEnd = 0; return false; }
618                 string rawChunk = rawSequence.substr(j, olength);
619                 if(compareDNASeq(oligo, rawChunk)) {
620                     primerStart = j;
621                     primerEnd = primerStart + olength;
622                     return true;
623                 }
624                 
625             }
626         }       
627                 
628         primerStart = 0; primerEnd = 0;
629                 return false;
630                 
631         }
632         catch(exception& e) {
633                 m->errorOut(e, "TrimOligos", "stripForward");
634                 exit(1);
635         }
636 }
637 //******************************************************************/
638 bool PcrSeqsCommand::findReverse(Sequence& seq, int& primerStart, int& primerEnd){
639         try {
640                 
641                 string rawSequence = seq.getUnaligned();
642                 
643                 for(int i=0;i<revPrimer.size();i++){
644                         string oligo = revPrimer[i];
645                         if(rawSequence.length() < oligo.length()) {  break;  }
646                         
647                         //search for primer
648             int olength = oligo.length();
649             for (int j = rawSequence.length()-olength; j >= 0; j--){
650                  if (m->control_pressed) {  primerStart = 0; primerEnd = 0; return false; }
651                 string rawChunk = rawSequence.substr(j, olength);
652             
653                 if(compareDNASeq(oligo, rawChunk)) {
654                     primerStart = j;
655                     primerEnd = primerStart + olength;
656                     return true;
657                 }
658                 
659             }
660                 }       
661                 
662         primerStart = 0; primerEnd = 0;
663                 return false;
664         }
665         catch(exception& e) {
666                 m->errorOut(e, "PcrSeqsCommand", "findReverse");
667                 exit(1);
668         }
669 }
670 //********************************************************************/
671 bool PcrSeqsCommand::isAligned(string seq, map<int, int>& aligned){
672         try {
673         bool isAligned = false;
674         
675         int countBases = 0;
676         for (int i = 0; i < seq.length(); i++) {
677             if (!isalpha(seq[i])) { isAligned = true; }
678             else { aligned[countBases] = i; countBases++; } //maps location in unaligned -> location in aligned.
679         }                                                   //ie. the 3rd base may be at spot 10 in the alignment
680                                                             //later when we trim we want to trim from spot 10.
681         return isAligned;
682     }
683         catch(exception& e) {
684                 m->errorOut(e, "PcrSeqsCommand", "isAligned");
685                 exit(1);
686         }
687 }
688 //********************************************************************/
689 string PcrSeqsCommand::reverseOligo(string oligo){
690         try {
691         string reverse = "";
692        
693         for(int i=oligo.length()-1;i>=0;i--){
694             
695             if(oligo[i] == 'A')         {       reverse += 'T'; }
696             else if(oligo[i] == 'T'){   reverse += 'A'; }
697             else if(oligo[i] == 'U'){   reverse += 'A'; }
698             
699             else if(oligo[i] == 'G'){   reverse += 'C'; }
700             else if(oligo[i] == 'C'){   reverse += 'G'; }
701             
702             else if(oligo[i] == 'R'){   reverse += 'Y'; }
703             else if(oligo[i] == 'Y'){   reverse += 'R'; }
704             
705             else if(oligo[i] == 'M'){   reverse += 'K'; }
706             else if(oligo[i] == 'K'){   reverse += 'M'; }
707             
708             else if(oligo[i] == 'W'){   reverse += 'W'; }
709             else if(oligo[i] == 'S'){   reverse += 'S'; }
710             
711             else if(oligo[i] == 'B'){   reverse += 'V'; }
712             else if(oligo[i] == 'V'){   reverse += 'B'; }
713             
714             else if(oligo[i] == 'D'){   reverse += 'H'; }
715             else if(oligo[i] == 'H'){   reverse += 'D'; }
716
717             else                                                {       reverse += 'N'; }
718         }
719
720         
721         return reverse;
722     }
723         catch(exception& e) {
724                 m->errorOut(e, "PcrSeqsCommand", "reverseOligo");
725                 exit(1);
726         }
727 }
728
729 //***************************************************************************************************************
730 bool PcrSeqsCommand::readOligos(){
731         try {
732                 ifstream inOligos;
733                 m->openInputFile(oligosfile, inOligos);
734                 
735                 string type, oligo, group;
736                 
737                 while(!inOligos.eof()){
738             
739                         inOligos >> type; 
740             
741                         if(type[0] == '#'){ //ignore
742                                 while (!inOligos.eof()) {       char c = inOligos.get();  if (c == 10 || c == 13){      break;  }       } // get rest of line if there's any crap there
743                                 m->gobble(inOligos);
744                         }else{
745                                 m->gobble(inOligos);
746                                 //make type case insensitive
747                                 for(int i=0;i<type.length();i++){       type[i] = toupper(type[i]);  }
748                                 
749                                 inOligos >> oligo;
750                                 
751                                 for(int i=0;i<oligo.length();i++){
752                                         oligo[i] = toupper(oligo[i]);
753                                         if(oligo[i] == 'U')     {       oligo[i] = 'T'; }
754                                 }
755                                 
756                                 if(type == "FORWARD"){
757                                         // get rest of line in case there is a primer name
758                                         while (!inOligos.eof()) {       
759                         char c = inOligos.get(); 
760                         if (c == 10 || c == 13){        break;  } 
761                         else if (c == 32 || c == 9){;} //space or tab
762                                         } 
763                                         primers.push_back(oligo);
764                 }else if(type == "REVERSE"){
765                     string oligoRC = reverseOligo(oligo);
766                     revPrimer.push_back(oligoRC);
767                     //cout << "oligo = " << oligo << " reverse = " << oligoRC << endl;
768                                 }else if(type == "BARCODE"){
769                                         inOligos >> group;
770                                 }else if((type == "LINKER")||(type == "SPACER")) {;}
771                                 else{   m->mothurOut(type + " is not recognized as a valid type. Choices are forward, reverse, linker, spacer and barcode. Ignoring " + oligo + "."); m->mothurOutEndLine(); m->control_pressed = true; }
772                         }
773                         m->gobble(inOligos);
774                 }       
775                 inOligos.close();
776                 
777                 if ((primers.size() == 0) && (revPrimer.size() == 0)) {
778                         m->mothurOut("[ERROR]: your oligos file does not contain valid primers or reverse primers.  Please correct."); m->mothurOutEndLine();
779             m->control_pressed = true;
780                         return false;
781                 }
782         
783         return true;
784         
785     }catch(exception& e) {
786                 m->errorOut(e, "PcrSeqsCommand", "readOligos");
787                 exit(1);
788         }
789 }
790 //***************************************************************************************************************
791 bool PcrSeqsCommand::readEcoli(){
792         try {
793                 ifstream in;
794                 m->openInputFile(ecolifile, in);
795                 
796         //read seq
797         if (!in.eof()){ 
798             Sequence ecoli(in); 
799             length = ecoli.getAligned().length();
800             start = ecoli.getStartPos();
801             end = ecoli.getEndPos();
802         }else { in.close(); m->control_pressed = true; return false; }
803         in.close();    
804                         
805         return true;
806     }
807         catch(exception& e) {
808                 m->errorOut(e, "PcrSeqsCommand", "readEcoli");
809                 exit(1);
810         }
811     
812 }
813 //***************************************************************************************************************
814 int PcrSeqsCommand::writeAccnos(set<string> badNames){
815         try {
816                 string thisOutputDir = outputDir;
817                 if (outputDir == "") {  thisOutputDir += m->hasPath(fastafile);  }
818                 string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(fastafile)) + "bad.accnos";
819         outputNames.push_back(outputFileName); outputTypes["accnos"].push_back(outputFileName);
820         
821         ofstream out;
822         m->openOutputFile(outputFileName, out);
823         
824         for (set<string>::iterator it = badNames.begin(); it != badNames.end(); it++) {
825             if (m->control_pressed) { break; }
826             out << (*it) << endl;
827         }
828         
829         out.close();
830         return 0;
831     }
832         catch(exception& e) {
833                 m->errorOut(e, "PcrSeqsCommand", "writeAccnos");
834                 exit(1);
835         }
836     
837 }
838 //******************************************************************/
839 bool PcrSeqsCommand::compareDNASeq(string oligo, string seq){
840         try {
841                 bool success = 1;
842                 int length = oligo.length();
843                 
844                 for(int i=0;i<length;i++){
845                         
846                         if(oligo[i] != seq[i]){
847                                 if(oligo[i] == 'A' || oligo[i] == 'T' || oligo[i] == 'G' || oligo[i] == 'C')    {       success = 0;    }
848                                 else if((oligo[i] == 'N' || oligo[i] == 'I') && (seq[i] == 'N'))                                {       success = 0;    }
849                                 else if(oligo[i] == 'R' && (seq[i] != 'A' && seq[i] != 'G'))                                    {       success = 0;    }
850                                 else if(oligo[i] == 'Y' && (seq[i] != 'C' && seq[i] != 'T'))                                    {       success = 0;    }
851                                 else if(oligo[i] == 'M' && (seq[i] != 'C' && seq[i] != 'A'))                                    {       success = 0;    }
852                                 else if(oligo[i] == 'K' && (seq[i] != 'T' && seq[i] != 'G'))                                    {       success = 0;    }
853                                 else if(oligo[i] == 'W' && (seq[i] != 'T' && seq[i] != 'A'))                                    {       success = 0;    }
854                                 else if(oligo[i] == 'S' && (seq[i] != 'C' && seq[i] != 'G'))                                    {       success = 0;    }
855                                 else if(oligo[i] == 'B' && (seq[i] != 'C' && seq[i] != 'T' && seq[i] != 'G'))   {       success = 0;    }
856                                 else if(oligo[i] == 'D' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'G'))   {       success = 0;    }
857                                 else if(oligo[i] == 'H' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'C'))   {       success = 0;    }
858                                 else if(oligo[i] == 'V' && (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G'))   {       success = 0;    }                       
859                                 
860                                 if(success == 0)        {       break;   }
861                         }
862                         else{
863                                 success = 1;
864                         }
865                 }
866                 
867                 return success;
868         }
869         catch(exception& e) {
870                 m->errorOut(e, "PcrSeqsCommand", "compareDNASeq");
871                 exit(1);
872         }
873         
874 }
875 //***************************************************************************************************************
876 int PcrSeqsCommand::readName(set<string>& names){
877         try {
878                 string thisOutputDir = outputDir;
879                 if (outputDir == "") {  thisOutputDir += m->hasPath(namefile);  }
880                 string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(namefile)) + "pcr" + m->getExtension(namefile);
881         
882                 ofstream out;
883                 m->openOutputFile(outputFileName, out);
884         
885                 ifstream in;
886                 m->openInputFile(namefile, in);
887                 string name, firstCol, secondCol;
888                 
889                 bool wroteSomething = false;
890                 int removedCount = 0;
891                 
892                 while(!in.eof()){
893                         if (m->control_pressed) { in.close();  out.close();  m->mothurRemove(outputFileName);  return 0; }
894                         
895                         in >> firstCol;         m->gobble(in);          
896                         in >> secondCol;                        
897                         
898             string savedSecond = secondCol;
899                         vector<string> parsedNames;
900                         m->splitAtComma(secondCol, parsedNames);
901                         
902                         vector<string> validSecond;  validSecond.clear();
903                         for (int i = 0; i < parsedNames.size(); i++) {
904                                 if (names.count(parsedNames[i]) == 0) {
905                                         validSecond.push_back(parsedNames[i]);
906                                 }
907                         }
908                         
909                         if (validSecond.size() != parsedNames.size()) {  //we want to get rid of someone, so get rid of everyone
910                                 for (int i = 0; i < parsedNames.size(); i++) {  names.insert(parsedNames[i]);  }
911                                 removedCount += parsedNames.size();
912                         }else {
913                 out << firstCol << '\t' << savedSecond << endl;
914                 wroteSomething = true;
915             }
916                         m->gobble(in);
917                 }
918                 in.close();
919                 out.close();
920                 
921                 if (wroteSomething == false) {  m->mothurOut("Your file contains only sequences from the .accnos file."); m->mothurOutEndLine();  }
922                 outputTypes["name"].push_back(outputFileName); outputNames.push_back(outputFileName);
923                 
924                 m->mothurOut("Removed " + toString(removedCount) + " sequences from your name file."); m->mothurOutEndLine();
925                 
926                 return 0;
927         }
928         catch(exception& e) {
929                 m->errorOut(e, "PcrSeqsCommand", "readName");
930                 exit(1);
931         }
932 }
933 //**********************************************************************************************************************
934 int PcrSeqsCommand::readGroup(set<string> names){
935         try {
936                 string thisOutputDir = outputDir;
937                 if (outputDir == "") {  thisOutputDir += m->hasPath(groupfile);  }
938                 string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(groupfile)) + "pcr" + m->getExtension(groupfile);
939                 
940                 ofstream out;
941                 m->openOutputFile(outputFileName, out);
942         
943                 ifstream in;
944                 m->openInputFile(groupfile, in);
945                 string name, group;
946                 
947                 bool wroteSomething = false;
948                 int removedCount = 0;
949                 
950                 while(!in.eof()){
951                         if (m->control_pressed) { in.close();  out.close();  m->mothurRemove(outputFileName);  return 0; }
952                         
953                         in >> name;                             //read from first column
954                         in >> group;                    //read from second column
955                         
956                         //if this name is in the accnos file
957                         if (names.count(name) == 0) {
958                                 wroteSomething = true;
959                                 out << name << '\t' << group << endl;
960                         }else {  removedCount++;  }
961             
962                         m->gobble(in);
963                 }
964                 in.close();
965                 out.close();
966                 
967                 if (wroteSomething == false) {  m->mothurOut("Your file contains only sequences from the .accnos file."); m->mothurOutEndLine();  }
968                 outputTypes["group"].push_back(outputFileName); outputNames.push_back(outputFileName);
969                 
970                 m->mothurOut("Removed " + toString(removedCount) + " sequences from your group file."); m->mothurOutEndLine();
971         
972                 
973                 return 0;
974         }
975         catch(exception& e) {
976                 m->errorOut(e, "PcrSeqsCommand", "readGroup");
977                 exit(1);
978         }
979 }
980 //**********************************************************************************************************************
981 int PcrSeqsCommand::readTax(set<string> names){
982         try {
983                 string thisOutputDir = outputDir;
984                 if (outputDir == "") {  thisOutputDir += m->hasPath(taxfile);  }
985                 string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(taxfile)) + "pcr" + m->getExtension(taxfile);
986                 ofstream out;
987                 m->openOutputFile(outputFileName, out);
988         
989                 ifstream in;
990                 m->openInputFile(taxfile, in);
991                 string name, tax;
992                 
993                 bool wroteSomething = false;
994                 int removedCount = 0;
995                 
996                 while(!in.eof()){
997                         if (m->control_pressed) { in.close();  out.close();  m->mothurRemove(outputFileName);  return 0; }
998                         
999                         in >> name;                             //read from first column
1000                         in >> tax;                      //read from second column
1001                         
1002                         //if this name is in the accnos file
1003                         if (names.count(name) == 0) {
1004                                 wroteSomething = true;
1005                                 out << name << '\t' << tax << endl;
1006                         }else {  removedCount++;  }
1007             
1008                         m->gobble(in);
1009                 }
1010                 in.close();
1011                 out.close();
1012                 
1013                 if (wroteSomething == false) {  m->mothurOut("Your file contains only sequences from the .accnos file."); m->mothurOutEndLine();  }
1014                 outputTypes["taxonomy"].push_back(outputFileName); outputNames.push_back(outputFileName);
1015                 
1016                 m->mothurOut("Removed " + toString(removedCount) + " sequences from your taxonomy file."); m->mothurOutEndLine();
1017                 
1018                 return 0;
1019         }
1020         catch(exception& e) {
1021                 m->errorOut(e, "PcrSeqsCommand", "readTax");
1022                 exit(1);
1023         }
1024 }
1025 /**************************************************************************************/
1026
1027