]> git.donarmstrong.com Git - mothur.git/blob - trimflowscommand.cpp
some of pat's small modifications
[mothur.git] / trimflowscommand.cpp
1 /*
2  *  trimflowscommand.cpp
3  *  Mothur
4  *
5  *  Created by Pat Schloss on 12/22/10.
6  *  Copyright 2010 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "trimflowscommand.h"
11 #include "needlemanoverlap.hpp"
12
13 //**********************************************************************************************************************
14 vector<string> TrimFlowsCommand::setParameters(){       
15         try {
16                 CommandParameter pflow("flow", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pflow);
17                 CommandParameter poligos("oligos", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(poligos);
18                 CommandParameter pmaxhomop("maxhomop", "Number", "", "9", "", "", "",false,false); parameters.push_back(pmaxhomop);
19                 CommandParameter pmaxflows("maxflows", "Number", "", "720", "", "", "",false,false); parameters.push_back(pmaxflows);
20                 CommandParameter pminflows("minflows", "Number", "", "360", "", "", "",false,false); parameters.push_back(pminflows);
21                 CommandParameter ppdiffs("pdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(ppdiffs);
22                 CommandParameter pbdiffs("bdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(pbdiffs);
23                 CommandParameter ptdiffs("tdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(ptdiffs);
24                 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
25                 CommandParameter psignal("signal", "Number", "", "0.50", "", "", "",false,false); parameters.push_back(psignal);
26                 CommandParameter pnoise("noise", "Number", "", "0.70", "", "", "",false,false); parameters.push_back(pnoise);
27                 CommandParameter pallfiles("allfiles", "Boolean", "", "t", "", "", "",false,false); parameters.push_back(pallfiles);
28                 CommandParameter porder("order", "String", "", "", "", "", "",false,false); parameters.push_back(porder);
29                 CommandParameter pfasta("fasta", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pfasta);
30                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
31                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
32                 
33                 vector<string> myArray;
34                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
35                 return myArray;
36         }
37         catch(exception& e) {
38                 m->errorOut(e, "TrimFlowsCommand", "setParameters");
39                 exit(1);
40         }
41 }
42 //**********************************************************************************************************************
43 string TrimFlowsCommand::getHelpString(){       
44         try {
45                 string helpString = "";
46                 helpString += "The trim.flows command reads a flowgram file and creates .....\n";
47                 helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n";
48                 helpString += "For more details please check out the wiki http://www.mothur.org/wiki/Trim.flows.\n";
49                 return helpString;
50         }
51         catch(exception& e) {
52                 m->errorOut(e, "TrimFlowsCommand", "getHelpString");
53                 exit(1);
54         }
55 }
56
57 //**********************************************************************************************************************
58
59 TrimFlowsCommand::TrimFlowsCommand(){   
60         try {
61                 abort = true; calledHelp = true; 
62                 setParameters();
63                 vector<string> tempOutNames;
64                 outputTypes["flow"] = tempOutNames;
65                 outputTypes["fasta"] = tempOutNames;
66         }
67         catch(exception& e) {
68                 m->errorOut(e, "TrimFlowsCommand", "TrimFlowsCommand");
69                 exit(1);
70         }
71 }
72 //**********************************************************************************************************************
73
74 TrimFlowsCommand::TrimFlowsCommand(string option)  {
75         try {
76                 
77                 abort = false; calledHelp = false;   
78                 comboStarts = 0;
79                 
80                 //allow user to run help
81                 if(option == "help") { help(); abort = true; calledHelp = true; }
82                 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
83                 
84                 else {
85                                                 
86                         vector<string> myArray = setParameters();
87                         
88                         OptionParser parser(option);
89                         map<string,string> parameters = parser.getParameters();
90                         
91                         ValidParameters validParameter;
92                         map<string,string>::iterator it;
93                         
94                         //check to make sure all parameters are valid for command
95                         for (it = parameters.begin(); it != parameters.end(); it++) { 
96                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
97                         }
98                         
99                         //initialize outputTypes
100                         vector<string> tempOutNames;
101                         outputTypes["flow"] = tempOutNames;
102                         outputTypes["fasta"] = tempOutNames;
103                         
104                         //if the user changes the input directory command factory will send this info to us in the output parameter 
105                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
106                         if (inputDir == "not found"){   inputDir = "";          }
107                         else {
108                                 string path;
109                                 it = parameters.find("flow");
110                                 //user has given a template file
111                                 if(it != parameters.end()){ 
112                                         path = m->hasPath(it->second);
113                                         //if the user has not given a path then, add inputdir. else leave path alone.
114                                         if (path == "") {       parameters["flow"] = inputDir + it->second;             }
115                                 }
116                                 
117                                 it = parameters.find("oligos");
118                                 //user has given a template file
119                                 if(it != parameters.end()){ 
120                                         path = m->hasPath(it->second);
121                                         //if the user has not given a path then, add inputdir. else leave path alone.
122                                         if (path == "") {       parameters["oligos"] = inputDir + it->second;           }
123                                 }
124                                 
125                         }
126                         
127                         
128                         //check for required parameters
129                         flowFileName = validParameter.validFile(parameters, "flow", true);
130                         if (flowFileName == "not found") { 
131                                 flowFileName = m->getFlowFile(); 
132                                 if (flowFileName != "") {  m->mothurOut("Using " + flowFileName + " as input file for the flow parameter."); m->mothurOutEndLine(); }
133                                 else { 
134                                         m->mothurOut("No valid current flow file. You must provide a flow file."); m->mothurOutEndLine(); 
135                                         abort = true;
136                                 } 
137                         }else if (flowFileName == "not open") { flowFileName = ""; abort = true; }      
138                         
139                         //if the user changes the output directory command factory will send this info to us in the output parameter 
140                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  
141                                 outputDir = ""; 
142                                 outputDir += m->hasPath(flowFileName); //if user entered a file with a path then preserve it    
143                         }
144                         
145                         
146                         //check for optional parameter and set defaults
147                         // ...at some point should added some additional type checking...
148                         
149                         string temp;
150                         temp = validParameter.validFile(parameters, "minflows", false); if (temp == "not found") { temp = "450"; }
151                         convert(temp, minFlows);  
152
153                         temp = validParameter.validFile(parameters, "maxflows", false); if (temp == "not found") { temp = "450"; }
154                         convert(temp, maxFlows);  
155                         
156                         
157                         temp = validParameter.validFile(parameters, "oligos", true);
158                         if (temp == "not found")        {       oligoFileName = "";             }
159                         else if(temp == "not open")     {       abort = true;                   } 
160                         else                                            {       oligoFileName = temp;   }
161                         
162                         temp = validParameter.validFile(parameters, "fasta", false);            if (temp == "not found"){       fasta = 0;              }
163                         else if(m->isTrue(temp))        {       fasta = 1;      }
164                         
165                         temp = validParameter.validFile(parameters, "maxhomop", false);         if (temp == "not found"){       temp = "9";             }
166                         convert(temp, maxHomoP);  
167
168                         temp = validParameter.validFile(parameters, "signal", false);           if (temp == "not found"){       temp = "0.50";  }
169                         convert(temp, signal);  
170
171                         temp = validParameter.validFile(parameters, "noise", false);            if (temp == "not found"){       temp = "0.70";  }
172                         convert(temp, noise);  
173         
174                         temp = validParameter.validFile(parameters, "bdiffs", false);           if (temp == "not found"){       temp = "0";             }
175                         convert(temp, bdiffs);
176                         
177                         temp = validParameter.validFile(parameters, "pdiffs", false);           if (temp == "not found"){       temp = "0";             }
178                         convert(temp, pdiffs);
179                         
180                         temp = validParameter.validFile(parameters, "tdiffs", false);
181                         if (temp == "not found"){ int tempTotal = pdiffs + bdiffs;  temp = toString(tempTotal); }
182                         convert(temp, tdiffs);
183                         if(tdiffs == 0){        tdiffs = bdiffs + pdiffs;       }
184                         
185                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found"){       temp = m->getProcessors();      }
186                         m->setProcessors(temp);
187                         convert(temp, processors);
188         
189                         flowOrder = validParameter.validFile(parameters, "order", false);
190                         if (flowOrder == "not found"){ flowOrder = "TACG";              }
191                         else if(flowOrder.length() != 4){
192                                 m->mothurOut("The value of the order option must be four bases long\n");
193                         }
194
195                         if(oligoFileName == "") {       allFiles = 0;           }
196                         else                                    {       allFiles = 1;           }
197
198                         numFPrimers = 0;
199                         numRPrimers = 0;
200                 }
201                 
202         }
203         catch(exception& e) {
204                 m->errorOut(e, "TrimFlowsCommand", "TrimSeqsCommand");
205                 exit(1);
206         }
207 }
208
209 //***************************************************************************************************************
210
211 int TrimFlowsCommand::execute(){
212         try{
213                 
214                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
215
216                 string trimFlowFileName = outputDir + m->getRootName(m->getSimpleName(flowFileName)) + "trim.flow";
217                 outputNames.push_back(trimFlowFileName); outputTypes["flow"].push_back(trimFlowFileName);
218                 
219                 string scrapFlowFileName = outputDir + m->getRootName(m->getSimpleName(flowFileName)) + "scrap.flow";
220                 outputNames.push_back(scrapFlowFileName); outputTypes["flow"].push_back(scrapFlowFileName);
221
222                 string fastaFileName = outputDir + m->getRootName(m->getSimpleName(flowFileName)) + "flow.fasta";
223                 if(fasta){
224                         outputNames.push_back(fastaFileName); outputTypes["fasta"].push_back(fastaFileName);
225                 }
226                 
227                 vector<unsigned long int> flowFilePos = getFlowFileBreaks();
228                 for (int i = 0; i < (flowFilePos.size()-1); i++) {
229                         lines.push_back(new linePair(flowFilePos[i], flowFilePos[(i+1)]));
230                 }       
231
232                 vector<vector<string> > barcodePrimerComboFileNames;
233                 if(oligoFileName != ""){
234                         getOligos(barcodePrimerComboFileNames); 
235                 }
236                 
237 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
238                 if(processors == 1){
239                         driverCreateTrim(flowFileName, trimFlowFileName, scrapFlowFileName, fastaFileName, barcodePrimerComboFileNames, lines[0]);
240                 }else{
241                         createProcessesCreateTrim(flowFileName, trimFlowFileName, scrapFlowFileName, fastaFileName, barcodePrimerComboFileNames); 
242                 }       
243 #else
244                 driverCreateTrim(flowFileName, trimFlowFileName, scrapFlowFileName, fastaFileName, barcodePrimerComboFileNames, lines[0]);
245 #endif
246                 
247                 if (m->control_pressed) {  return 0; }                  
248                 
249                 string flowFilesFileName;
250                 ofstream output;
251                 
252                 if(allFiles){
253                         
254                         flowFilesFileName = outputDir + m->getRootName(m->getSimpleName(flowFileName)) + "flow.files";
255                         m->openOutputFile(flowFilesFileName, output);
256
257                         for(int i=0;i<barcodePrimerComboFileNames.size();i++){
258                                 for(int j=0;j<barcodePrimerComboFileNames[0].size();j++){
259                                         
260                                         FILE * pFile;
261                                         unsigned long int size;
262                                         
263                                         //get num bytes in file
264                                         pFile = fopen (barcodePrimerComboFileNames[i][j].c_str(),"rb");
265                                         if (pFile==NULL) perror ("Error opening file");
266                                         else{
267                                                 fseek (pFile, 0, SEEK_END);
268                                                 size=ftell(pFile);
269                                                 fclose (pFile);
270                                         }
271
272                                         if(size < 10){
273                                                 remove(barcodePrimerComboFileNames[i][j].c_str());
274                                         }
275                                         else{
276                                                 output << barcodePrimerComboFileNames[i][j] << endl;
277                                                 outputNames.push_back(barcodePrimerComboFileNames[i][j]);
278                                                 outputTypes["flow"].push_back(barcodePrimerComboFileNames[i][j]);
279                                         }
280                                 }
281                         }
282                         output.close();
283                 }
284                 else{
285                         flowFilesFileName = outputDir + m->getRootName(m->getSimpleName(flowFileName)) + "flow.files";
286                         m->openOutputFile(flowFilesFileName, output);
287                         
288                         output << trimFlowFileName << endl;
289                         
290                         output.close();
291                 }
292                 outputTypes["flow.files"].push_back(flowFilesFileName);
293                 outputNames.push_back(flowFilesFileName);
294                 
295 //              set fasta file as new current fastafile
296 //              string current = "";
297 //              itTypes = outputTypes.find("fasta");
298 //              if (itTypes != outputTypes.end()) {
299 //                      if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFastaFile(current); }
300 //              }
301                 
302                 m->mothurOutEndLine();
303                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
304                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
305                 m->mothurOutEndLine();
306                 
307                 return 0;       
308         }
309         catch(exception& e) {
310                 m->errorOut(e, "TrimSeqsCommand", "execute");
311                 exit(1);
312         }
313 }
314
315 //***************************************************************************************************************
316
317 int TrimFlowsCommand::driverCreateTrim(string flowFileName, string trimFlowFileName, string scrapFlowFileName, string fastaFileName, vector<vector<string> > barcodePrimerComboFileNames, linePair* line){
318         
319         try {
320                 
321                 ofstream trimFlowFile;
322                 m->openOutputFile(trimFlowFileName, trimFlowFile);
323                 trimFlowFile.setf(ios::fixed, ios::floatfield); trimFlowFile.setf(ios::showpoint);
324
325                 ofstream scrapFlowFile;
326                 m->openOutputFile(scrapFlowFileName, scrapFlowFile);
327                 scrapFlowFile.setf(ios::fixed, ios::floatfield); scrapFlowFile.setf(ios::showpoint);
328
329                 if(line->start == 4){
330                         scrapFlowFile << maxFlows << endl;
331                         trimFlowFile << maxFlows << endl;
332                         if(allFiles){
333                                 for(int i=0;i<barcodePrimerComboFileNames.size();i++){
334                                         for(int j=0;j<barcodePrimerComboFileNames[0].size();j++){
335                                                 //                              barcodePrimerComboFileNames[i][j] += toString(getpid()) + ".temp";
336                                                 ofstream temp;
337                                                 m->openOutputFile(barcodePrimerComboFileNames[i][j], temp);
338                                                 temp << maxFlows << endl;
339                                                 temp.close();
340                                         }
341                                 }                       
342                         }
343                 }
344                 
345                 FlowData flowData(numFlows, signal, noise, maxHomoP, flowOrder);
346                 
347                 ofstream fastaFile;
348                 if(fasta){      m->openOutputFile(fastaFileName, fastaFile);    }
349                 
350                 ifstream flowFile;
351                 m->openInputFile(flowFileName, flowFile);
352                 
353                 flowFile.seekg(line->start);
354
355                 int count = 0;
356                 bool moreSeqs = 1;
357                         
358                 while(moreSeqs) {
359                         
360                         int success = 1;
361                         int currentSeqDiffs = 0;
362                         string trashCode = "";
363                         
364                         flowData.getNext(flowFile);
365                         flowData.capFlows(maxFlows);    
366                         
367                         Sequence currSeq = flowData.getSequence();
368                         if(!flowData.hasMinFlows(minFlows)){    //screen to see if sequence is of a minimum number of flows
369                                 success = 0;
370                                 trashCode += 'l';
371                         }
372                         
373                         int primerIndex = 0;
374                         int barcodeIndex = 0;
375                         
376                         if(barcodes.size() != 0){
377                                 success = stripBarcode(currSeq, barcodeIndex);
378                                 if(success > bdiffs)            {       trashCode += 'b';       }
379                                 else{ currentSeqDiffs += success;  }
380                         }
381                         
382                         if(numFPrimers != 0){
383                                 success = stripForward(currSeq, primerIndex);
384                                 if(success > pdiffs)            {       trashCode += 'f';       }
385                                 else{ currentSeqDiffs += success;  }
386                         }
387                         
388                         if (currentSeqDiffs > tdiffs)   {       trashCode += 't';   }
389                         
390                         if(numRPrimers != 0){
391                                 success = stripReverse(currSeq);
392                                 if(!success)                            {       trashCode += 'r';       }
393                         }
394
395                         if(trashCode.length() == 0){
396                                                         
397                                 flowData.printFlows(trimFlowFile);
398                         
399                                 if(fasta)       {       currSeq.printSequence(fastaFile);       }
400                                 
401                                 if(allFiles){
402                                         ofstream output;
403                                         m->openOutputFileAppend(barcodePrimerComboFileNames[barcodeIndex][primerIndex], output);
404                                         output.setf(ios::fixed, ios::floatfield); trimFlowFile.setf(ios::showpoint);
405                                         
406                                         flowData.printFlows(output);
407                                         output.close();
408                                 }                               
409                         }
410                         else{
411                                 flowData.printFlows(scrapFlowFile, trashCode);
412                         }
413                                 
414                         count++;
415                                                 
416                         //report progress
417                         if((count) % 10000 == 0){       m->mothurOut(toString(count)); m->mothurOutEndLine();           }
418
419 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
420                         unsigned long int pos = flowFile.tellg();
421
422                         if ((pos == -1) || (pos >= line->end)) { break; }
423 #else
424                         if (flowFile.eof()) { break; }
425 #endif
426                         
427                 }
428                 //report progress
429                 if((count) % 10000 != 0){       m->mothurOut(toString(count)); m->mothurOutEndLine();           }
430                 
431                 trimFlowFile.close();
432                 scrapFlowFile.close();
433                 flowFile.close();
434                 if(fasta){      fastaFile.close();      }
435                 
436                 return count;
437         }
438         catch(exception& e) {
439                 m->errorOut(e, "TrimSeqsCommand", "driverCreateTrim");
440                 exit(1);
441         }
442 }
443
444 //***************************************************************************************************************
445
446 void TrimFlowsCommand::getOligos(vector<vector<string> >& outFlowFileNames){
447         try {
448                 ifstream oligosFile;
449                 m->openInputFile(oligoFileName, oligosFile);
450                 
451                 string type, oligo, group;
452
453                 int indexPrimer = 0;
454                 int indexBarcode = 0;
455                 
456                 while(!oligosFile.eof()){
457                 
458                         oligosFile >> type; m->gobble(oligosFile);      //get the first column value of the row - is it a comment or a feature we are interested in?
459
460                         if(type[0] == '#'){     //igore the line because there's a comment
461                                 while (!oligosFile.eof())       {       char c = oligosFile.get(); if (c == 10 || c == 13){     break;  }       } // get rest of line if there's any crap there
462                         }
463                         else{                           //there's a feature we're interested in
464
465                                 for(int i=0;i<type.length();i++){       type[i] = toupper(type[i]);  }                                  //make type case insensitive
466
467                                 oligosFile >> oligo;    //get the DNA sequence for the feature
468
469                                 for(int i=0;i<oligo.length();i++){      //make type case insensitive and change any U's to T's
470                                         oligo[i] = toupper(oligo[i]);
471                                         if(oligo[i] == 'U')     {       oligo[i] = 'T'; }
472                                 }
473
474                                 if(type == "FORWARD"){  //if the feature is a forward primer...
475                                         group = "";
476
477                                         while (!oligosFile.eof())       {       // get rest of line in case there is a primer name = will have the name of the primer
478                                                 char c = oligosFile.get(); 
479                                                 if (c == 10 || c == 13){        break;  }
480                                                 else if (c == 32 || c == 9){;} //space or tab
481                                                 else {  group += c;  }
482                                         } 
483
484                                         //have we seen this primer already?
485                                         map<string, int>::iterator itPrimer = primers.find(oligo);
486                                         if (itPrimer != primers.end()) { m->mothurOut("primer " + oligo + " is in your oligos file already."); m->mothurOutEndLine();  }
487
488                                         primers[oligo]=indexPrimer; indexPrimer++;
489                                         primerNameVector.push_back(group);
490
491                                 }
492                                 else if(type == "REVERSE"){
493                                         Sequence oligoRC("reverse", oligo);
494                                         oligoRC.reverseComplement();
495                                         revPrimer.push_back(oligoRC.getUnaligned());
496                                 }
497                                 else if(type == "BARCODE"){
498                                         oligosFile >> group;
499
500                                         //check for repeat barcodes
501                                         map<string, int>::iterator itBar = barcodes.find(oligo);
502                                         if (itBar != barcodes.end()) { m->mothurOut("barcode " + oligo + " is in your oligos file already."); m->mothurOutEndLine();  }
503
504                                         barcodes[oligo]=indexBarcode; indexBarcode++;
505                                         barcodeNameVector.push_back(group);
506                                 }
507                                 else{
508                                         m->mothurOut(type + " is not recognized as a valid type. Choices are forward, reverse, and barcode. Ignoring " + oligo + "."); m->mothurOutEndLine();  
509                                 }
510                         }
511
512                         m->gobble(oligosFile);
513                 }
514                 oligosFile.close();
515                 
516                 if(barcodeNameVector.size() == 0 && primerNameVector[0] == ""){ allFiles = 0;   }
517                 
518                 //add in potential combos
519                 if(barcodeNameVector.size() == 0){
520                         barcodes[""] = 0;
521                         barcodeNameVector.push_back("");                        
522                 }
523                 
524                 if(primerNameVector.size() == 0){
525                         primers[""] = 0;
526                         primerNameVector.push_back("");                 
527                 }
528                 
529                 
530                 outFlowFileNames.resize(barcodeNameVector.size());
531                 for(int i=0;i<outFlowFileNames.size();i++){
532                         outFlowFileNames[i].assign(primerNameVector.size(), "");
533                 }
534                 
535                 if(allFiles){
536
537                         for(map<string, int>::iterator itBar = barcodes.begin();itBar != barcodes.end();itBar++){
538                                 for(map<string, int>::iterator itPrimer = primers.begin();itPrimer != primers.end(); itPrimer++){
539
540                                         string primerName = primerNameVector[itPrimer->second];
541                                         string barcodeName = barcodeNameVector[itBar->second];
542                                                                                 
543                                         string comboGroupName = "";
544                                         string fileName = "";
545                                         
546                                         if(primerName == ""){
547                                                 comboGroupName = barcodeNameVector[itBar->second];
548                                                 fileName = outputDir + m->getRootName(m->getSimpleName(flowFileName)) + comboGroupName + ".flow";
549                                         }
550                                         else{
551                                                 if(barcodeName == ""){
552                                                         comboGroupName = primerNameVector[itPrimer->second];
553                                                 }
554                                                 else{
555                                                         comboGroupName = barcodeNameVector[itBar->second] + "." + primerNameVector[itPrimer->second];
556                                                 }
557                                                 fileName = outputDir + m->getRootName(m->getSimpleName(flowFileName)) + comboGroupName + ".flow";
558                                         }
559                                         
560                                         outFlowFileNames[itBar->second][itPrimer->second] = fileName;
561                                         
562                                         ofstream temp;
563                                         m->openOutputFile(fileName, temp);
564                                         temp.close();
565                                 }
566                         }
567                 }
568                 
569                 numFPrimers = primers.size();
570                 numRPrimers = revPrimer.size();
571                 
572         }
573         catch(exception& e) {
574                 m->errorOut(e, "TrimSeqsCommand", "getOligos");
575                 exit(1);
576         }
577 }
578
579 //***************************************************************************************************************
580
581 int TrimFlowsCommand::stripBarcode(Sequence& seq, int& group){
582         try {
583                 
584                 string rawSequence = seq.getUnaligned();
585                 int success = bdiffs + 1;       //guilty until proven innocent
586                 
587                 //can you find the barcode
588                 for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
589                         string oligo = it->first;
590                         if(rawSequence.length() < oligo.length()){      //let's just assume that the barcodes are the same length
591                                 success = bdiffs + 10;                                  //if the sequence is shorter than the barcode then bail out
592                                 break;  
593                         }
594                         
595                         if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
596                                 group = it->second;
597                                 seq.setUnaligned(rawSequence.substr(oligo.length()));
598                                 
599                                 success = 0;
600                                 break;
601                         }
602                 }
603                 
604                 //if you found the barcode or if you don't want to allow for diffs
605                 if ((bdiffs == 0) || (success == 0)) { return success;  }
606                 
607                 else { //try aligning and see if you can find it
608                         
609                         int maxLength = 0;
610                         
611                         Alignment* alignment;
612                         if (barcodes.size() > 0) {
613                                 map<string,int>::iterator it=barcodes.begin();
614                                 
615                                 for(it;it!=barcodes.end();it++){
616                                         if(it->first.length() > maxLength){
617                                                 maxLength = it->first.length();
618                                         }
619                                 }
620                                 alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+bdiffs+1));  
621                                 
622                         }else{ alignment = NULL; } 
623                         
624                         //can you find the barcode
625                         int minDiff = 1e6;
626                         int minCount = 1;
627                         int minGroup = -1;
628                         int minPos = 0;
629                         
630                         for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
631                                 string oligo = it->first;
632                                 //                              int length = oligo.length();
633                                 
634                                 if(rawSequence.length() < maxLength){   //let's just assume that the barcodes are the same length
635                                         success = bdiffs + 10;
636                                         break;
637                                 }
638                                 
639                                 //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
640                                 alignment->align(oligo, rawSequence.substr(0,oligo.length()+bdiffs));
641                                 oligo = alignment->getSeqAAln();
642                                 string temp = alignment->getSeqBAln();
643                                 
644                                 int alnLength = oligo.length();
645                                 
646                                 for(int i=oligo.length()-1;i>=0;i--){
647                                         if(oligo[i] != '-'){    alnLength = i+1;        break;  }
648                                 }
649                                 oligo = oligo.substr(0,alnLength);
650                                 temp = temp.substr(0,alnLength);
651                                 
652                                 int numDiff = countDiffs(oligo, temp);
653                                 
654                                 if(numDiff < minDiff){
655                                         minDiff = numDiff;
656                                         minCount = 1;
657                                         minGroup = it->second;
658                                         minPos = 0;
659                                         for(int i=0;i<alnLength;i++){
660                                                 if(temp[i] != '-'){
661                                                         minPos++;
662                                                 }
663                                         }
664                                 }
665                                 else if(numDiff == minDiff){
666                                         minCount++;
667                                 }
668                                 
669                         }
670                         
671                         if(minDiff > bdiffs)    {       success = minDiff;              }       //no good matches
672                         else if(minCount > 1)   {       success = bdiffs + 100; }       //can't tell the difference between multiple barcodes
673                         else{                                                                                                   //use the best match
674                                 group = minGroup;
675                                 seq.setUnaligned(rawSequence.substr(minPos));
676                                 success = minDiff;
677                         }
678                         
679                         if (alignment != NULL) {  delete alignment;  }
680                         
681                 }
682                 
683                 return success;
684                 
685         }
686         catch(exception& e) {
687                 m->errorOut(e, "TrimFlowsCommand", "stripBarcode");
688                 exit(1);
689         }
690         
691 }
692
693 //***************************************************************************************************************
694
695 int TrimFlowsCommand::stripForward(Sequence& seq, int& group){
696         try {
697                 
698                 string rawSequence = seq.getUnaligned();
699                 int success = pdiffs + 1;       //guilty until proven innocent
700
701                 //can you find the primer
702                 for(map<string,int>::iterator it=primers.begin();it!=primers.end();it++){
703                         string oligo = it->first;
704                         if(rawSequence.length() < oligo.length()){      //let's just assume that the primers are the same length
705                                 success = pdiffs + 10;                                  //if the sequence is shorter than the barcode then bail out
706                                 break;  
707                         }
708                         
709                         if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
710                                 group = it->second;
711                                 seq.setUnaligned(rawSequence.substr(oligo.length()));
712                                 success = 0;
713                                 break;
714                         }
715                 }
716                 
717                 //if you found the barcode or if you don't want to allow for diffs
718                 if ((pdiffs == 0) || (success == 0)) {  return success;  }
719                 
720                 else { //try aligning and see if you can find it
721                         
722                         int maxLength = 0;
723                         
724                         Alignment* alignment;
725                         if (primers.size() > 0) {
726                                 map<string,int>::iterator it=primers.begin();
727                                 
728                                 for(it;it!=primers.end();it++){
729                                         if(it->first.length() > maxLength){
730                                                 maxLength = it->first.length();
731                                         }
732                                 }
733                                 alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+pdiffs+1));  
734                                 
735                         }else{ alignment = NULL; } 
736                         
737                         //can you find the barcode
738                         int minDiff = 1e6;
739                         int minCount = 1;
740                         int minGroup = -1;
741                         int minPos = 0;
742                         
743                         for(map<string,int>::iterator it=primers.begin();it!=primers.end();it++){
744                                 string oligo = it->first;
745                                 //                              int length = oligo.length();
746                                 
747                                 if(rawSequence.length() < maxLength){   
748                                         success = pdiffs + 100;
749                                         break;
750                                 }
751                                 
752                                 //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
753                                 alignment->align(oligo, rawSequence.substr(0,oligo.length()+pdiffs));
754                                 oligo = alignment->getSeqAAln();
755                                 string temp = alignment->getSeqBAln();
756                                 
757                                 int alnLength = oligo.length();
758                                 
759                                 for(int i=oligo.length()-1;i>=0;i--){
760                                         if(oligo[i] != '-'){    alnLength = i+1;        break;  }
761                                 }
762                                 oligo = oligo.substr(0,alnLength);
763                                 temp = temp.substr(0,alnLength);
764                                 
765                                 int numDiff = countDiffs(oligo, temp);
766                                 
767                                 if(numDiff < minDiff){
768                                         minDiff = numDiff;
769                                         minCount = 1;
770                                         minGroup = it->second;
771                                         minPos = 0;
772                                         for(int i=0;i<alnLength;i++){
773                                                 if(temp[i] != '-'){
774                                                         minPos++;
775                                                 }
776                                         }
777                                 }
778                                 else if(numDiff == minDiff){
779                                         minCount++;
780                                 }
781                                 
782                         }
783                         
784                         if(minDiff > pdiffs)    {       success = minDiff;              }       //no good matches
785                         else if(minCount > 1)   {       success = pdiffs + 10;  }       //can't tell the difference between multiple primers
786                         else{                                                                                                   //use the best match
787                                 group = minGroup;
788                                 seq.setUnaligned(rawSequence.substr(minPos));
789                                 success = minDiff;
790                         }
791                         
792                         if (alignment != NULL) {  delete alignment;  }
793                         
794                 }
795                 
796                 return success;
797                 
798         }
799         catch(exception& e) {
800                 m->errorOut(e, "TrimFlowsCommand", "stripForward");
801                 exit(1);
802         }
803 }
804
805 //***************************************************************************************************************
806
807 bool TrimFlowsCommand::stripReverse(Sequence& seq){
808         try {
809
810                 string rawSequence = seq.getUnaligned();
811                 bool success = 0;       //guilty until proven innocent
812                 
813                 for(int i=0;i<numRPrimers;i++){
814                         string oligo = revPrimer[i];
815                         
816                         if(rawSequence.length() < oligo.length()){
817                                 success = 0;
818                                 break;
819                         }
820                         
821                         if(compareDNASeq(oligo, rawSequence.substr(rawSequence.length()-oligo.length(),oligo.length()))){
822                                 seq.setUnaligned(rawSequence.substr(0,rawSequence.length()-oligo.length()));
823                                 success = 1;
824                                 break;
825                         }
826                 }       
827
828                 return success;
829                 
830         }
831         catch(exception& e) {
832                 m->errorOut(e, "TrimFlowsCommand", "stripReverse");
833                 exit(1);
834         }
835 }
836
837
838 //***************************************************************************************************************
839
840 bool TrimFlowsCommand::compareDNASeq(string oligo, string seq){
841         try {
842                 bool success = 1;
843                 int length = oligo.length();
844                 
845                 for(int i=0;i<length;i++){
846                         
847                         if(oligo[i] != seq[i]){
848                                 if(oligo[i] == 'A' || oligo[i] == 'T' || oligo[i] == 'G' || oligo[i] == 'C')    {       success = 0;    }
849                                 else if((oligo[i] == 'N' || oligo[i] == 'I') && (seq[i] == 'N'))                                {       success = 0;    }
850                                 else if(oligo[i] == 'R' && (seq[i] != 'A' && seq[i] != 'G'))                                    {       success = 0;    }
851                                 else if(oligo[i] == 'Y' && (seq[i] != 'C' && seq[i] != 'T'))                                    {       success = 0;    }
852                                 else if(oligo[i] == 'M' && (seq[i] != 'C' && seq[i] != 'A'))                                    {       success = 0;    }
853                                 else if(oligo[i] == 'K' && (seq[i] != 'T' && seq[i] != 'G'))                                    {       success = 0;    }
854                                 else if(oligo[i] == 'W' && (seq[i] != 'T' && seq[i] != 'A'))                                    {       success = 0;    }
855                                 else if(oligo[i] == 'S' && (seq[i] != 'C' && seq[i] != 'G'))                                    {       success = 0;    }
856                                 else if(oligo[i] == 'B' && (seq[i] != 'C' && seq[i] != 'T' && seq[i] != 'G'))   {       success = 0;    }
857                                 else if(oligo[i] == 'D' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'G'))   {       success = 0;    }
858                                 else if(oligo[i] == 'H' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'C'))   {       success = 0;    }
859                                 else if(oligo[i] == 'V' && (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G'))   {       success = 0;    }                       
860                                 
861                                 if(success == 0)        {       break;   }
862                         }
863                         else{
864                                 success = 1;
865                         }
866                 }
867                 
868                 return success;
869         }
870         catch(exception& e) {
871                 m->errorOut(e, "TrimFlowsCommand", "compareDNASeq");
872                 exit(1);
873         }
874         
875 }
876
877 //***************************************************************************************************************
878
879 int TrimFlowsCommand::countDiffs(string oligo, string seq){
880         try {
881                 
882                 int length = oligo.length();
883                 int countDiffs = 0;
884                 
885                 for(int i=0;i<length;i++){
886                         
887                         if(oligo[i] != seq[i]){
888                                 if(oligo[i] == 'A' || oligo[i] == 'T' || oligo[i] == 'G' || oligo[i] == 'C' || oligo[i] == '-' || oligo[i] == '.')      {       countDiffs++;   }
889                                 else if((oligo[i] == 'N' || oligo[i] == 'I') && (seq[i] == 'N'))                                {       countDiffs++;   }
890                                 else if(oligo[i] == 'R' && (seq[i] != 'A' && seq[i] != 'G'))                                    {       countDiffs++;   }
891                                 else if(oligo[i] == 'Y' && (seq[i] != 'C' && seq[i] != 'T'))                                    {       countDiffs++;   }
892                                 else if(oligo[i] == 'M' && (seq[i] != 'C' && seq[i] != 'A'))                                    {       countDiffs++;   }
893                                 else if(oligo[i] == 'K' && (seq[i] != 'T' && seq[i] != 'G'))                                    {       countDiffs++;   }
894                                 else if(oligo[i] == 'W' && (seq[i] != 'T' && seq[i] != 'A'))                                    {       countDiffs++;   }
895                                 else if(oligo[i] == 'S' && (seq[i] != 'C' && seq[i] != 'G'))                                    {       countDiffs++;   }
896                                 else if(oligo[i] == 'B' && (seq[i] != 'C' && seq[i] != 'T' && seq[i] != 'G'))   {       countDiffs++;   }
897                                 else if(oligo[i] == 'D' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'G'))   {       countDiffs++;   }
898                                 else if(oligo[i] == 'H' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'C'))   {       countDiffs++;   }
899                                 else if(oligo[i] == 'V' && (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G'))   {       countDiffs++;   }       
900                         }
901                         
902                 }
903                 
904                 return countDiffs;
905         }
906         catch(exception& e) {
907                 m->errorOut(e, "TrimFlowsCommand", "countDiffs");
908                 exit(1);
909         }
910         
911 }
912
913 /**************************************************************************************************/
914
915 vector<unsigned long int> TrimFlowsCommand::getFlowFileBreaks() {
916
917         try{
918                         
919                 vector<unsigned long int> filePos;
920                 filePos.push_back(0);
921                                         
922                 FILE * pFile;
923                 unsigned long int size;
924                 
925                 //get num bytes in file
926                 pFile = fopen (flowFileName.c_str(),"rb");
927                 if (pFile==NULL) perror ("Error opening file");
928                 else{
929                         fseek (pFile, 0, SEEK_END);
930                         size=ftell (pFile);
931                         fclose (pFile);
932                 }
933                                 
934                 //estimate file breaks
935                 unsigned long int chunkSize = 0;
936                 chunkSize = size / processors;
937
938                 //file too small to divide by processors
939                 if (chunkSize == 0)  {  processors = 1; filePos.push_back(size); return filePos;        }
940                 
941                 //for each process seekg to closest file break and search for next '>' char. make that the filebreak
942                 for (int i = 0; i < processors; i++) {
943                         unsigned long int spot = (i+1) * chunkSize;
944                         
945                         ifstream in;
946                         m->openInputFile(flowFileName, in);
947                         in.seekg(spot);
948                         
949                         string dummy = m->getline(in);
950                         
951                         //there was not another sequence before the end of the file
952                         unsigned long int sanityPos = in.tellg();
953                         
954 //                      if (sanityPos == -1) {  break;  }
955 //                      else {  filePos.push_back(newSpot);  }
956                         if (sanityPos == -1) {  break;  }
957                         else {  filePos.push_back(sanityPos);  }
958                         
959                         in.close();
960                 }
961                 
962                 //save end pos
963                 filePos.push_back(size);
964                 
965                 //sanity check filePos
966                 for (int i = 0; i < (filePos.size()-1); i++) {
967                         if (filePos[(i+1)] <= filePos[i]) {  filePos.erase(filePos.begin()+(i+1)); i--; }
968                 }
969
970                 ifstream in;
971                 m->openInputFile(flowFileName, in);
972                 in >> numFlows;
973                 m->gobble(in);
974                 unsigned long int spot = in.tellg();
975                 filePos[0] = spot;
976                 in.close();
977                 
978                 processors = (filePos.size() - 1);
979                 
980                 return filePos; 
981         }
982         catch(exception& e) {
983                 m->errorOut(e, "TrimSeqsCommand", "getFlowFileBreaks");
984                 exit(1);
985         }
986 }
987
988 /**************************************************************************************************/
989
990 int TrimFlowsCommand::createProcessesCreateTrim(string flowFileName, string trimFlowFileName, string scrapFlowFileName, string fastaFileName, vector<vector<string> > barcodePrimerComboFileNames){
991
992         try {
993 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
994                 int process = 1;
995                 int exitCommand = 1;
996                 processIDS.clear();
997                 
998                 //loop through and create all the processes you want
999                 while (process != processors) {
1000                         int pid = fork();
1001                         
1002                         if (pid > 0) {
1003                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
1004                                 process++;
1005                         }else if (pid == 0){
1006                                 
1007                                 vector<vector<string> > tempBarcodePrimerComboFileNames = barcodePrimerComboFileNames;
1008                                 if(allFiles){
1009                                         for(int i=0;i<tempBarcodePrimerComboFileNames.size();i++){
1010                                                 for(int j=0;j<tempBarcodePrimerComboFileNames[0].size();j++){
1011                                                         tempBarcodePrimerComboFileNames[i][j] += toString(getpid()) + ".temp";
1012                                                         ofstream temp;
1013                                                         m->openOutputFile(tempBarcodePrimerComboFileNames[i][j], temp);
1014                                                         temp.close();
1015                                                         
1016                                                 }
1017                                         }
1018                                 }
1019                                 driverCreateTrim(flowFileName,
1020                                                                  (trimFlowFileName + toString(getpid()) + ".temp"),
1021                                                                  (scrapFlowFileName + toString(getpid()) + ".temp"),
1022                                                                  (fastaFileName + toString(getpid()) + ".temp"),
1023                                                                  tempBarcodePrimerComboFileNames, lines[process]);
1024
1025                                 exit(0);
1026                         }else { 
1027                                 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine(); 
1028                                 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
1029                                 exit(0);
1030                         }
1031                 }
1032                 
1033                 //parent do my part
1034                 ofstream temp;
1035                 m->openOutputFile(trimFlowFileName, temp);
1036                 temp.close();
1037
1038                 m->openOutputFile(scrapFlowFileName, temp);
1039                 temp.close();
1040                 
1041                 if(fasta){
1042                         m->openOutputFile(fastaFileName, temp);
1043                         temp.close();
1044                 }
1045                 
1046                 driverCreateTrim(flowFileName, trimFlowFileName, scrapFlowFileName, fastaFileName, barcodePrimerComboFileNames, lines[0]);
1047
1048                 //force parent to wait until all the processes are done
1049                 for (int i=0;i<processIDS.size();i++) { 
1050                         int temp = processIDS[i];
1051                         wait(&temp);
1052                 }
1053                 
1054                 //append files
1055                 m->mothurOutEndLine();
1056                 for(int i=0;i<processIDS.size();i++){
1057                         
1058                         m->mothurOut("Appending files from process " + toString(processIDS[i])); m->mothurOutEndLine();
1059                         
1060                         m->appendFiles((trimFlowFileName + toString(processIDS[i]) + ".temp"), trimFlowFileName);
1061                         remove((trimFlowFileName + toString(processIDS[i]) + ".temp").c_str());
1062 //                      m->mothurOut("\tDone with trim.flow file"); m->mothurOutEndLine();
1063
1064                         m->appendFiles((scrapFlowFileName + toString(processIDS[i]) + ".temp"), scrapFlowFileName);
1065                         remove((scrapFlowFileName + toString(processIDS[i]) + ".temp").c_str());
1066 //                      m->mothurOut("\tDone with scrap.flow file"); m->mothurOutEndLine();
1067
1068                         if(fasta){
1069                                 m->appendFiles((fastaFileName + toString(processIDS[i]) + ".temp"), fastaFileName);
1070                                 remove((fastaFileName + toString(processIDS[i]) + ".temp").c_str());
1071 //                              m->mothurOut("\tDone with flow.fasta file"); m->mothurOutEndLine();
1072                         }
1073                         if(allFiles){                                           
1074                                 for (int j = 0; j < barcodePrimerComboFileNames.size(); j++) {
1075                                         for (int k = 0; k < barcodePrimerComboFileNames[0].size(); k++) {
1076                                                 m->appendFiles((barcodePrimerComboFileNames[j][k] + toString(processIDS[i]) + ".temp"), barcodePrimerComboFileNames[j][k]);
1077                                                 remove((barcodePrimerComboFileNames[j][k] + toString(processIDS[i]) + ".temp").c_str());
1078                                         }
1079                                 }
1080                         }
1081                 }
1082                 
1083                 return exitCommand;
1084 #endif          
1085         }
1086         catch(exception& e) {
1087                 m->errorOut(e, "TrimFlowsCommand", "createProcessesCreateTrim");
1088                 exit(1);
1089         }
1090 }
1091
1092 //***************************************************************************************************************