]> git.donarmstrong.com Git - mothur.git/blob - trimflowscommand.cpp
added shhh.seqs command
[mothur.git] / trimflowscommand.cpp
1 /*
2  *  trimflowscommand.cpp
3  *  Mothur
4  *
5  *  Created by Pat Schloss on 12/22/10.
6  *  Copyright 2010 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "trimflowscommand.h"
11 #include "needlemanoverlap.hpp"
12
13 //**********************************************************************************************************************
14
15 vector<string> TrimFlowsCommand::getValidParameters(){  
16         try {
17                 string Array[] =  {"flow", "totalflows", "minflows",
18                         "fasta", "minlength", "maxlength", "maxhomop", "signal", "noise"
19                         "oligos", "pdiffs", "bdiffs", "tdiffs", 
20                         "allfiles", "processors",
21
22
23                         
24 //                      "group",
25                         "outputdir","inputdir"
26                 
27                 };
28                 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
29                 return myArray;
30         }
31         catch(exception& e) {
32                 m->errorOut(e, "TrimFlowsCommand", "getValidParameters");
33                 exit(1);
34         }
35 }
36
37 //**********************************************************************************************************************
38
39 vector<string> TrimFlowsCommand::getRequiredParameters(){       
40         try {
41                 string Array[] =  {"flow"};
42                 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
43                 return myArray;
44         }
45         catch(exception& e) {
46                 m->errorOut(e, "TrimFlowsCommand", "getRequiredParameters");
47                 exit(1);
48         }
49 }
50
51 //**********************************************************************************************************************
52
53 vector<string> TrimFlowsCommand::getRequiredFiles(){    
54         try {
55                 vector<string> myArray;
56                 return myArray;
57         }
58         catch(exception& e) {
59                 m->errorOut(e, "TrimFlowsCommand", "getRequiredFiles");
60                 exit(1);
61         }
62 }
63
64 //**********************************************************************************************************************
65
66 TrimFlowsCommand::TrimFlowsCommand(){   
67         try {
68                 abort = true;
69                 //initialize outputTypes
70                 vector<string> tempOutNames;
71                 outputTypes["flow"] = tempOutNames;
72                 outputTypes["fasta"] = tempOutNames;
73         }
74         catch(exception& e) {
75                 m->errorOut(e, "TrimFlowsCommand", "TrimFlowsCommand");
76                 exit(1);
77         }
78 }
79
80 //***************************************************************************************************************
81
82 TrimFlowsCommand::~TrimFlowsCommand(){  /*      do nothing      */      }
83
84 //***************************************************************************************************************
85
86 void TrimFlowsCommand::help(){
87         try {
88                 m->mothurOut("The trim.flows command reads a flowgram file and creates .....\n");
89                 m->mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n");
90                 m->mothurOut("For more details please check out the wiki http://www.mothur.org/wiki/Trim.flows.\n\n");
91                 
92         }
93         catch(exception& e) {
94                 m->errorOut(e, "TrimFlowsCommand", "help");
95                 exit(1);
96         }
97 }
98
99 //**********************************************************************************************************************
100
101 TrimFlowsCommand::TrimFlowsCommand(string option)  {
102         try {
103                 
104                 abort = false;
105                 comboStarts = 0;
106                 
107                 //allow user to run help
108                 if(option == "help") { help(); abort = true; }
109                 
110                 else {
111                         //valid paramters for this command
112                         string AlignArray[] =  {"flow", "totalflows", "minflows",
113                                 "fasta", "minlength", "maxlength", "maxhomop", "signal", "noise",
114                                 "oligos", "pdiffs", "bdiffs", "tdiffs", 
115                                 "allfiles", "processors",
116                 
117                                 //                      "group",
118                                 "outputdir","inputdir"
119                                 
120                         };
121                         
122                         vector<string> myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string)));
123                         
124                         OptionParser parser(option);
125                         map<string,string> parameters = parser.getParameters();
126                         
127                         ValidParameters validParameter;
128                         map<string,string>::iterator it;
129                         
130                         //check to make sure all parameters are valid for command
131                         for (it = parameters.begin(); it != parameters.end(); it++) { 
132                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
133                         }
134                         
135                         //initialize outputTypes
136                         vector<string> tempOutNames;
137                         outputTypes["flow"] = tempOutNames;
138                         outputTypes["fasta"] = tempOutNames;
139                         
140                         //if the user changes the input directory command factory will send this info to us in the output parameter 
141                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
142                         if (inputDir == "not found"){   inputDir = "";          }
143                         else {
144                                 string path;
145                                 it = parameters.find("flow");
146                                 //user has given a template file
147                                 if(it != parameters.end()){ 
148                                         path = m->hasPath(it->second);
149                                         //if the user has not given a path then, add inputdir. else leave path alone.
150                                         if (path == "") {       parameters["flow"] = inputDir + it->second;             }
151                                 }
152                                 
153                                 it = parameters.find("oligos");
154                                 //user has given a template file
155                                 if(it != parameters.end()){ 
156                                         path = m->hasPath(it->second);
157                                         //if the user has not given a path then, add inputdir. else leave path alone.
158                                         if (path == "") {       parameters["oligos"] = inputDir + it->second;           }
159                                 }
160                                 
161                                 
162 //                              it = parameters.find("group");
163 //                              //user has given a template file
164 //                              if(it != parameters.end()){ 
165 //                                      path = m->hasPath(it->second);
166 //                                      //if the user has not given a path then, add inputdir. else leave path alone.
167 //                                      if (path == "") {       parameters["group"] = inputDir + it->second;            }
168 //                              }
169                         }
170                         
171                         
172                         //check for required parameters
173                         flowFileName = validParameter.validFile(parameters, "flow", true);
174                         if (flowFileName == "not found") { m->mothurOut("flow is a required parameter for the trim.flows command."); m->mothurOutEndLine(); abort = true; }
175                         else if (flowFileName == "not open") { abort = true; }  
176                         
177                         //if the user changes the output directory command factory will send this info to us in the output parameter 
178                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  
179                                 outputDir = ""; 
180                                 outputDir += m->hasPath(flowFileName); //if user entered a file with a path then preserve it    
181                         }
182                         
183                         
184                         //check for optional parameter and set defaults
185                         // ...at some point should added some additional type checking...
186                         
187                         string temp;
188                         temp = validParameter.validFile(parameters, "minflows", false);         if (temp == "not found") { temp = "360"; }
189                         convert(temp, minFlows);  
190
191                         temp = validParameter.validFile(parameters, "totalflows", false);       if (temp == "not found") { temp = "720"; }
192                         convert(temp, totalFlows);  
193                         
194                         
195                         temp = validParameter.validFile(parameters, "oligos", true);
196                         if (temp == "not found")        {       oligoFileName = "";             }
197                         else if(temp == "not open")     {       abort = true;                   } 
198                         else                                            {       oligoFileName = temp;   }
199                         
200                         temp = validParameter.validFile(parameters, "fasta", false);            if (temp == "not found"){       fasta = 0;              }
201                         else if(m->isTrue(temp))        {       fasta = 1;      }
202                         
203                         temp = validParameter.validFile(parameters, "maxhomop", false);         if (temp == "not found"){       temp = "9";             }
204                         convert(temp, maxHomoP);  
205
206                         temp = validParameter.validFile(parameters, "signal", false);           if (temp == "not found"){       temp = "0.50";  }
207                         convert(temp, signal);  
208
209                         temp = validParameter.validFile(parameters, "noise", false);            if (temp == "not found"){       temp = "0.70";  }
210                         convert(temp, noise);  
211
212                         temp = validParameter.validFile(parameters, "minlength", false);        if (temp == "not found"){       temp = "0";             }
213                         convert(temp, minLength); 
214                         
215                         temp = validParameter.validFile(parameters, "maxlength", false);        if (temp == "not found"){       temp = "0";             }
216                         convert(temp, maxLength);
217                         
218                         temp = validParameter.validFile(parameters, "bdiffs", false);           if (temp == "not found"){       temp = "0";             }
219                         convert(temp, bdiffs);
220                         
221                         temp = validParameter.validFile(parameters, "pdiffs", false);           if (temp == "not found"){       temp = "0";             }
222                         convert(temp, pdiffs);
223                         
224                         temp = validParameter.validFile(parameters, "tdiffs", false);
225                         if (temp == "not found"){ int tempTotal = pdiffs + bdiffs;  temp = toString(tempTotal); }
226                         convert(temp, tdiffs);
227                         if(tdiffs == 0){        tdiffs = bdiffs + pdiffs;       }
228                         
229                         temp = validParameter.validFile(parameters, "allfiles", false);         if (temp == "not found") { temp = "T";          }
230                         allFiles = m->isTrue(temp);
231                         
232                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found") { temp = "1"; }
233                         convert(temp, processors); 
234                         
235                         if(oligoFileName == ""){        allFiles = 0;           }
236
237                         numFPrimers = 0;
238                         numRPrimers = 0;
239                 }
240                 
241         }
242         catch(exception& e) {
243                 m->errorOut(e, "TrimFlowsCommand", "TrimSeqsCommand");
244                 exit(1);
245         }
246 }
247
248 //***************************************************************************************************************
249
250 int TrimFlowsCommand::execute(){
251         try{
252                 
253                 if (abort == true) { return 0; }
254
255                 string trimFlowFileName = outputDir + m->getRootName(m->getSimpleName(flowFileName)) + "trim.flow";
256                 outputNames.push_back(trimFlowFileName); outputTypes["flow"].push_back(trimFlowFileName);
257                 
258                 string scrapFlowFileName = outputDir + m->getRootName(m->getSimpleName(flowFileName)) + "scrap.flow";
259                 outputNames.push_back(scrapFlowFileName); outputTypes["flow"].push_back(scrapFlowFileName);
260
261                 string fastaFileName = outputDir + m->getRootName(m->getSimpleName(flowFileName)) + "flow.fasta";
262                 if(fasta){
263                         outputNames.push_back(fastaFileName); outputTypes["fasta"].push_back(fastaFileName);
264                 }
265                 
266                 vector<unsigned long int> flowFilePos = getFlowFileBreaks();
267                 for (int i = 0; i < (flowFilePos.size()-1); i++) {
268                         lines.push_back(new linePair(flowFilePos[i], flowFilePos[(i+1)]));
269                 }       
270
271                 vector<vector<string> > barcodePrimerComboFileNames;
272                 if(oligoFileName != ""){
273                         getOligos(barcodePrimerComboFileNames); 
274                 }
275                 
276 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
277                 if(processors == 1){
278                         driverCreateTrim(flowFileName, trimFlowFileName, scrapFlowFileName, fastaFileName, barcodePrimerComboFileNames, lines[0]);
279                 }else{
280                         createProcessesCreateTrim(flowFileName, trimFlowFileName, scrapFlowFileName, fastaFileName, barcodePrimerComboFileNames); 
281                 }       
282 #else
283                 driverCreateTrim(flowFileName, trimFlowFileName, scrapFlowFileName, fastaFileName, barcodePrimerComboFileNames, lines[0]);
284 #endif
285                 
286                 if (m->control_pressed) {  return 0; }                  
287                 
288                 if(allFiles){
289                         
290                         ofstream output;
291                         string flowFilesFileName = outputDir + m->getRootName(m->getSimpleName(flowFileName)) + "flow.files";
292                         m->openOutputFile(flowFilesFileName, output);
293
294                         for(int i=0;i<barcodePrimerComboFileNames.size();i++){
295                                 for(int j=0;j<barcodePrimerComboFileNames[0].size();j++){
296                                         
297                                         FILE * pFile;
298                                         unsigned long int size;
299                                         
300                                         //get num bytes in file
301                                         pFile = fopen (barcodePrimerComboFileNames[i][j].c_str(),"rb");
302                                         if (pFile==NULL) perror ("Error opening file");
303                                         else{
304                                                 fseek (pFile, 0, SEEK_END);
305                                                 size=ftell (pFile);
306                                                 fclose (pFile);
307                                         }
308
309                                         if(size < 10){
310 //                                              m->mothurOut("deleting: " + barcodePrimerComboFileNames[i][j] + '\n');
311                                                 remove(barcodePrimerComboFileNames[i][j].c_str());
312                                         }
313                                         else{
314 //                                              m->mothurOut("saving: " + barcodePrimerComboFileNames[i][j] + '\n');
315                                                 output << barcodePrimerComboFileNames[i][j] << endl;
316                                         }
317                                 }
318                         }
319                         output.close();
320                 }
321                 
322                 m->mothurOutEndLine();
323                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
324                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
325                 m->mothurOutEndLine();
326                 
327                 return 0;       
328         }
329         catch(exception& e) {
330                 m->errorOut(e, "TrimSeqsCommand", "execute");
331                 exit(1);
332         }
333 }
334
335 //***************************************************************************************************************
336
337 int TrimFlowsCommand::driverCreateTrim(string flowFileName, string trimFlowFileName, string scrapFlowFileName, string fastaFileName, vector<vector<string> > barcodePrimerComboFileNames, linePair* line){
338         
339         try {
340                 
341                 ofstream trimFlowFile;
342                 m->openOutputFile(trimFlowFileName, trimFlowFile);
343                 trimFlowFile.setf(ios::fixed, ios::floatfield); trimFlowFile.setf(ios::showpoint);
344
345                 ofstream scrapFlowFile;
346                 m->openOutputFile(scrapFlowFileName, scrapFlowFile);
347                 scrapFlowFile.setf(ios::fixed, ios::floatfield); scrapFlowFile.setf(ios::showpoint);
348
349                 if(line->start == 4){
350                         scrapFlowFile << totalFlows << endl;
351                         trimFlowFile << totalFlows << endl;
352                         for(int i=0;i<barcodePrimerComboFileNames.size();i++){
353                                 for(int j=0;j<barcodePrimerComboFileNames[0].size();j++){
354                                         //                              barcodePrimerComboFileNames[i][j] += toString(getpid()) + ".temp";
355                                         ofstream temp;
356                                         m->openOutputFile(barcodePrimerComboFileNames[i][j], temp);
357                                         temp << totalFlows << endl;
358                                         temp.close();
359                                 }
360                         }                       
361                 }
362                 
363                 FlowData flowData(numFlows, signal, noise, maxHomoP);
364                 
365                 ofstream fastaFile;
366                 if(fasta){      m->openOutputFile(fastaFileName, fastaFile);    }
367                 
368                 ifstream flowFile;
369                 m->openInputFile(flowFileName, flowFile);
370                 
371                 flowFile.seekg(line->start);
372
373                 int count = 0;
374                 bool moreSeqs = 1;
375                 
376                 while(moreSeqs) {
377                         
378                         int success = 1;
379                         int currentSeqDiffs = 0;
380                         string trashCode = "";
381                         
382                         flowData.getNext(flowFile);
383                         flowData.capFlows(totalFlows);  
384                         
385                         Sequence currSeq = flowData.getSequence();
386                         if(!flowData.hasMinFlows(minFlows)){    //screen to see if sequence is of a minimum number of flows
387                                 success = 0;
388                                 trashCode += 'l';
389                         }
390                         
391                         if(minLength > 0 || maxLength > 0){     //screen to see if sequence is above and below a specific number of bases
392                                 int seqLength = currSeq.getNumBases();
393                                 if(seqLength < minLength || seqLength > maxLength){
394                                         success = 0;
395                                         trashCode += 'l';
396                                 }
397                         }
398                         
399                         int primerIndex, barcodeIndex;
400                         
401                         if(barcodes.size() != 0){
402                                 success = stripBarcode(currSeq, barcodeIndex);
403                                 if(success > bdiffs)            {       trashCode += 'b';       }
404                                 else{ currentSeqDiffs += success;  }
405                         }
406                         
407                         if(numFPrimers != 0){
408                                 success = stripForward(currSeq, primerIndex);
409                                 if(success > pdiffs)            {       trashCode += 'f';       }
410                                 else{ currentSeqDiffs += success;  }
411                         }
412                         
413                         if (currentSeqDiffs > tdiffs)   {       trashCode += 't';   }
414                         
415                         if(numRPrimers != 0){
416                                 success = stripReverse(currSeq);
417                                 if(!success)                            {       trashCode += 'r';       }
418                         }
419                                                 
420                                 
421                         if(trashCode.length() == 0){
422                                 flowData.printFlows(trimFlowFile);
423                                 
424                                 if(fasta){
425                                         currSeq.printSequence(fastaFile);
426                                 }
427                                 
428                                 if(barcodes.size() != 0 || primers.size() != 0){
429 //                                      string fileName = outputDir + m->getRootName(m->getSimpleName(flowFileName)) + barcodePrimerCombos[barcodeIndex][primerIndex] + ".flow";
430                                         ofstream output;
431                                         m->openOutputFileAppend(barcodePrimerComboFileNames[barcodeIndex][primerIndex], output);
432                                         output.setf(ios::fixed, ios::floatfield); trimFlowFile.setf(ios::showpoint);
433                                         
434                                         flowData.printFlows(output);
435                                         output.close();
436                                 }
437                                 
438                         }
439                         else{
440                                 flowData.printFlows(scrapFlowFile, trashCode);
441                         }
442                                 
443                         count++;
444                                                 
445                         //report progress
446                         if((count) % 10000 == 0){       m->mothurOut(toString(count)); m->mothurOutEndLine();           }
447
448 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
449                         unsigned long int pos = flowFile.tellg();
450
451                         if ((pos == -1) || (pos >= line->end)) { break; }
452 #else
453                         if (flowFile.eof()) { break; }
454 #endif
455                         
456                 }
457                 //report progress
458                 if((count) % 10000 != 0){       m->mothurOut(toString(count)); m->mothurOutEndLine();           }
459                 
460                 trimFlowFile.close();
461                 scrapFlowFile.close();
462                 flowFile.close();
463                 if(fasta){      fastaFile.close();      }
464                 
465                 return count;
466         }
467         catch(exception& e) {
468                 m->errorOut(e, "TrimSeqsCommand", "driverCreateTrim");
469                 exit(1);
470         }
471 }
472
473 //***************************************************************************************************************
474
475 void TrimFlowsCommand::getOligos(vector<vector<string> >& outFlowFileNames){
476         try {
477                 ifstream oligosFile;
478                 m->openInputFile(oligoFileName, oligosFile);
479                 
480                 string type, oligo, group;
481
482                 int indexPrimer = 0;
483                 int indexBarcode = 0;
484                 
485                 while(!oligosFile.eof()){
486                 
487                         oligosFile >> type; m->gobble(oligosFile);      //get the first column value of the row - is it a comment or a feature we are interested in?
488
489                         if(type[0] == '#'){     //igore the line because there's a comment
490                                 while (!oligosFile.eof())       {       char c = oligosFile.get(); if (c == 10 || c == 13){     break;  }       } // get rest of line if there's any crap there
491                         }
492                         else{                           //there's a feature we're interested in
493
494                                 for(int i=0;i<type.length();i++){       type[i] = toupper(type[i]);  }                                  //make type case insensitive
495
496                                 oligosFile >> oligo;    //get the DNA sequence for the feature
497
498                                 for(int i=0;i<oligo.length();i++){      //make type case insensitive and change any U's to T's
499                                         oligo[i] = toupper(oligo[i]);
500                                         if(oligo[i] == 'U')     {       oligo[i] = 'T'; }
501                                 }
502
503                                 if(type == "FORWARD"){  //if the feature is a forward primer...
504                                         group = "";
505
506                                         while (!oligosFile.eof())       {       // get rest of line in case there is a primer name = will have the name of the primer
507                                                 char c = oligosFile.get(); 
508                                                 if (c == 10 || c == 13){        break;  }
509                                                 else if (c == 32 || c == 9){;} //space or tab
510                                                 else {  group += c;  }
511                                         } 
512
513                                         //have we seen this primer already?
514                                         map<string, int>::iterator itPrimer = primers.find(oligo);
515                                         if (itPrimer != primers.end()) { m->mothurOut("primer " + oligo + " is in your oligos file already."); m->mothurOutEndLine();  }
516
517                                         primers[oligo]=indexPrimer; indexPrimer++;
518                                         primerNameVector.push_back(group);
519
520                                 }
521                                 else if(type == "REVERSE"){
522                                         Sequence oligoRC("reverse", oligo);
523                                         oligoRC.reverseComplement();
524                                         revPrimer.push_back(oligoRC.getUnaligned());
525                                 }
526                                 else if(type == "BARCODE"){
527                                         oligosFile >> group;
528
529                                         //check for repeat barcodes
530                                         map<string, int>::iterator itBar = barcodes.find(oligo);
531                                         if (itBar != barcodes.end()) { m->mothurOut("barcode " + oligo + " is in your oligos file already."); m->mothurOutEndLine();  }
532
533                                         barcodes[oligo]=indexBarcode; indexBarcode++;
534                                         barcodeNameVector.push_back(group);
535                                 }
536                                 else{
537                                         m->mothurOut(type + " is not recognized as a valid type. Choices are forward, reverse, and barcode. Ignoring " + oligo + "."); m->mothurOutEndLine();  
538                                 }
539                         }
540
541                         m->gobble(oligosFile);
542                 }
543                 oligosFile.close();
544                 
545                 //add in potential combos       
546                 outFlowFileNames.resize(barcodeNameVector.size());
547                 for(int i=0;i<outFlowFileNames.size();i++){
548                         outFlowFileNames[i].assign(primerNameVector.size(), "");
549                 }
550                 
551                 if(allFiles){
552                         for(map<string, int>::iterator itBar = barcodes.begin();itBar != barcodes.end();itBar++){
553                                 for(map<string, int>::iterator itPrimer = primers.begin();itPrimer != primers.end(); itPrimer++){
554
555                                         string primerName = primerNameVector[itPrimer->second];
556                                         string barcodeName = barcodeNameVector[itBar->second];
557                                                                                 
558                                         string comboGroupName = "";
559                                         string fileName = "";
560                                         
561                                         if(primerName == ""){
562                                                 comboGroupName = barcodeNameVector[itBar->second];
563                                                 fileName = outputDir + m->getRootName(m->getSimpleName(flowFileName)) + comboGroupName + ".flow";
564                                         }
565                                         else{
566                                                 comboGroupName = barcodeNameVector[itBar->second] + "." + primerNameVector[itPrimer->second];
567                                                 fileName = outputDir + m->getRootName(m->getSimpleName(flowFileName)) + comboGroupName + ".flow";
568                                         }
569                                         
570                                         outputNames.push_back(fileName);
571                                         outputTypes["flow"].push_back(fileName);
572                                         outFlowFileNames[itBar->second][itPrimer->second] = fileName;
573                                         
574                                         ofstream temp;
575                                         m->openOutputFile(fileName, temp);
576                                         temp.close();
577                                 }
578                         }
579                 }
580                 
581                 numFPrimers = primers.size();
582                 numRPrimers = revPrimer.size();
583                 
584         }
585         catch(exception& e) {
586                 m->errorOut(e, "TrimSeqsCommand", "getOligos");
587                 exit(1);
588         }
589 }
590
591 //***************************************************************************************************************
592
593 int TrimFlowsCommand::stripBarcode(Sequence& seq, int& group){
594         try {
595                 
596                 string rawSequence = seq.getUnaligned();
597                 int success = bdiffs + 1;       //guilty until proven innocent
598                 
599                 //can you find the barcode
600                 for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
601                         string oligo = it->first;
602                         if(rawSequence.length() < oligo.length()){      //let's just assume that the barcodes are the same length
603                                 success = bdiffs + 10;                                  //if the sequence is shorter than the barcode then bail out
604                                 break;  
605                         }
606                         
607                         if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
608                                 group = it->second;
609                                 seq.setUnaligned(rawSequence.substr(oligo.length()));
610                                 
611                                 success = 0;
612                                 break;
613                         }
614                 }
615                 
616                 //if you found the barcode or if you don't want to allow for diffs
617                 if ((bdiffs == 0) || (success == 0)) { return success;  }
618                 
619                 else { //try aligning and see if you can find it
620                         
621                         int maxLength = 0;
622                         
623                         Alignment* alignment;
624                         if (barcodes.size() > 0) {
625                                 map<string,int>::iterator it=barcodes.begin();
626                                 
627                                 for(it;it!=barcodes.end();it++){
628                                         if(it->first.length() > maxLength){
629                                                 maxLength = it->first.length();
630                                         }
631                                 }
632                                 alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+bdiffs+1));  
633                                 
634                         }else{ alignment = NULL; } 
635                         
636                         //can you find the barcode
637                         int minDiff = 1e6;
638                         int minCount = 1;
639                         int minGroup = -1;
640                         int minPos = 0;
641                         
642                         for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
643                                 string oligo = it->first;
644                                 //                              int length = oligo.length();
645                                 
646                                 if(rawSequence.length() < maxLength){   //let's just assume that the barcodes are the same length
647                                         success = bdiffs + 10;
648                                         break;
649                                 }
650                                 
651                                 //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
652                                 alignment->align(oligo, rawSequence.substr(0,oligo.length()+bdiffs));
653                                 oligo = alignment->getSeqAAln();
654                                 string temp = alignment->getSeqBAln();
655                                 
656                                 int alnLength = oligo.length();
657                                 
658                                 for(int i=oligo.length()-1;i>=0;i--){
659                                         if(oligo[i] != '-'){    alnLength = i+1;        break;  }
660                                 }
661                                 oligo = oligo.substr(0,alnLength);
662                                 temp = temp.substr(0,alnLength);
663                                 
664                                 int newStart=0;
665                                 int numDiff = countDiffs(oligo, temp);
666                                 
667                                 if(numDiff < minDiff){
668                                         minDiff = numDiff;
669                                         minCount = 1;
670                                         minGroup = it->second;
671                                         minPos = 0;
672                                         for(int i=0;i<alnLength;i++){
673                                                 if(temp[i] != '-'){
674                                                         minPos++;
675                                                 }
676                                         }
677                                 }
678                                 else if(numDiff == minDiff){
679                                         minCount++;
680                                 }
681                                 
682                         }
683                         
684                         if(minDiff > bdiffs)    {       success = minDiff;              }       //no good matches
685                         else if(minCount > 1)   {       success = bdiffs + 100; }       //can't tell the difference between multiple barcodes
686                         else{                                                                                                   //use the best match
687                                 group = minGroup;
688                                 seq.setUnaligned(rawSequence.substr(minPos));
689                                 success = minDiff;
690                         }
691                         
692                         if (alignment != NULL) {  delete alignment;  }
693                         
694                 }
695                 
696                 return success;
697                 
698         }
699         catch(exception& e) {
700                 m->errorOut(e, "TrimFlowsCommand", "stripBarcode");
701                 exit(1);
702         }
703         
704 }
705
706 //***************************************************************************************************************
707
708 int TrimFlowsCommand::stripForward(Sequence& seq, int& group){
709         try {
710                 
711                 string rawSequence = seq.getUnaligned();
712                 int success = pdiffs + 1;       //guilty until proven innocent
713                 
714                 //can you find the primer
715                 for(map<string,int>::iterator it=primers.begin();it!=primers.end();it++){
716                         string oligo = it->first;
717                         if(rawSequence.length() < oligo.length()){      //let's just assume that the primers are the same length
718                                 success = pdiffs + 10;                                  //if the sequence is shorter than the barcode then bail out
719                                 break;  
720                         }
721                         
722                         if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
723                                 group = it->second;
724                                 seq.setUnaligned(rawSequence.substr(oligo.length()));
725                                 success = 0;
726                                 break;
727                         }
728                 }
729                 
730                 //if you found the barcode or if you don't want to allow for diffs
731                 if ((pdiffs == 0) || (success == 0)) { return success;  }
732                 
733                 else { //try aligning and see if you can find it
734                         
735                         int maxLength = 0;
736                         
737                         Alignment* alignment;
738                         if (primers.size() > 0) {
739                                 map<string,int>::iterator it=primers.begin();
740                                 
741                                 for(it;it!=primers.end();it++){
742                                         if(it->first.length() > maxLength){
743                                                 maxLength = it->first.length();
744                                         }
745                                 }
746                                 alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+pdiffs+1));  
747                                 
748                         }else{ alignment = NULL; } 
749                         
750                         //can you find the barcode
751                         int minDiff = 1e6;
752                         int minCount = 1;
753                         int minGroup = -1;
754                         int minPos = 0;
755                         
756                         for(map<string,int>::iterator it=primers.begin();it!=primers.end();it++){
757                                 string oligo = it->first;
758                                 //                              int length = oligo.length();
759                                 
760                                 if(rawSequence.length() < maxLength){   
761                                         success = pdiffs + 100;
762                                         break;
763                                 }
764                                 
765                                 //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
766                                 alignment->align(oligo, rawSequence.substr(0,oligo.length()+pdiffs));
767                                 oligo = alignment->getSeqAAln();
768                                 string temp = alignment->getSeqBAln();
769                                 
770                                 int alnLength = oligo.length();
771                                 
772                                 for(int i=oligo.length()-1;i>=0;i--){
773                                         if(oligo[i] != '-'){    alnLength = i+1;        break;  }
774                                 }
775                                 oligo = oligo.substr(0,alnLength);
776                                 temp = temp.substr(0,alnLength);
777                                 
778                                 int newStart=0;
779                                 int numDiff = countDiffs(oligo, temp);
780                                 
781                                 if(numDiff < minDiff){
782                                         minDiff = numDiff;
783                                         minCount = 1;
784                                         minGroup = it->second;
785                                         minPos = 0;
786                                         for(int i=0;i<alnLength;i++){
787                                                 if(temp[i] != '-'){
788                                                         minPos++;
789                                                 }
790                                         }
791                                 }
792                                 else if(numDiff == minDiff){
793                                         minCount++;
794                                 }
795                                 
796                         }
797                         
798                         if(minDiff > pdiffs)    {       success = minDiff;              }       //no good matches
799                         else if(minCount > 1)   {       success = pdiffs + 10;  }       //can't tell the difference between multiple primers
800                         else{                                                                                                   //use the best match
801                                 group = minGroup;
802                                 seq.setUnaligned(rawSequence.substr(minPos));
803                                 success = minDiff;
804                         }
805                         
806                         if (alignment != NULL) {  delete alignment;  }
807                         
808                 }
809                 
810                 return success;
811                 
812         }
813         catch(exception& e) {
814                 m->errorOut(e, "TrimFlowsCommand", "stripForward");
815                 exit(1);
816         }
817 }
818
819 //***************************************************************************************************************
820
821 bool TrimFlowsCommand::stripReverse(Sequence& seq){
822         try {
823
824                 string rawSequence = seq.getUnaligned();
825                 bool success = 0;       //guilty until proven innocent
826                 
827                 for(int i=0;i<numRPrimers;i++){
828                         string oligo = revPrimer[i];
829                         
830                         if(rawSequence.length() < oligo.length()){
831                                 success = 0;
832                                 break;
833                         }
834                         
835                         if(compareDNASeq(oligo, rawSequence.substr(rawSequence.length()-oligo.length(),oligo.length()))){
836                                 seq.setUnaligned(rawSequence.substr(0,rawSequence.length()-oligo.length()));
837                                 success = 1;
838                                 break;
839                         }
840                 }       
841
842                 return success;
843                 
844         }
845         catch(exception& e) {
846                 m->errorOut(e, "TrimFlowsCommand", "stripReverse");
847                 exit(1);
848         }
849 }
850
851
852 //***************************************************************************************************************
853
854 bool TrimFlowsCommand::compareDNASeq(string oligo, string seq){
855         try {
856                 bool success = 1;
857                 int length = oligo.length();
858                 
859                 for(int i=0;i<length;i++){
860                         
861                         if(oligo[i] != seq[i]){
862                                 if(oligo[i] == 'A' || oligo[i] == 'T' || oligo[i] == 'G' || oligo[i] == 'C')    {       success = 0;    }
863                                 else if((oligo[i] == 'N' || oligo[i] == 'I') && (seq[i] == 'N'))                                {       success = 0;    }
864                                 else if(oligo[i] == 'R' && (seq[i] != 'A' && seq[i] != 'G'))                                    {       success = 0;    }
865                                 else if(oligo[i] == 'Y' && (seq[i] != 'C' && seq[i] != 'T'))                                    {       success = 0;    }
866                                 else if(oligo[i] == 'M' && (seq[i] != 'C' && seq[i] != 'A'))                                    {       success = 0;    }
867                                 else if(oligo[i] == 'K' && (seq[i] != 'T' && seq[i] != 'G'))                                    {       success = 0;    }
868                                 else if(oligo[i] == 'W' && (seq[i] != 'T' && seq[i] != 'A'))                                    {       success = 0;    }
869                                 else if(oligo[i] == 'S' && (seq[i] != 'C' && seq[i] != 'G'))                                    {       success = 0;    }
870                                 else if(oligo[i] == 'B' && (seq[i] != 'C' && seq[i] != 'T' && seq[i] != 'G'))   {       success = 0;    }
871                                 else if(oligo[i] == 'D' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'G'))   {       success = 0;    }
872                                 else if(oligo[i] == 'H' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'C'))   {       success = 0;    }
873                                 else if(oligo[i] == 'V' && (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G'))   {       success = 0;    }                       
874                                 
875                                 if(success == 0)        {       break;   }
876                         }
877                         else{
878                                 success = 1;
879                         }
880                 }
881                 
882                 return success;
883         }
884         catch(exception& e) {
885                 m->errorOut(e, "TrimFlowsCommand", "compareDNASeq");
886                 exit(1);
887         }
888         
889 }
890
891 //***************************************************************************************************************
892
893 int TrimFlowsCommand::countDiffs(string oligo, string seq){
894         try {
895                 
896                 int length = oligo.length();
897                 int countDiffs = 0;
898                 
899                 for(int i=0;i<length;i++){
900                         
901                         if(oligo[i] != seq[i]){
902                                 if(oligo[i] == 'A' || oligo[i] == 'T' || oligo[i] == 'G' || oligo[i] == 'C' || oligo[i] == '-' || oligo[i] == '.')      {       countDiffs++;   }
903                                 else if((oligo[i] == 'N' || oligo[i] == 'I') && (seq[i] == 'N'))                                {       countDiffs++;   }
904                                 else if(oligo[i] == 'R' && (seq[i] != 'A' && seq[i] != 'G'))                                    {       countDiffs++;   }
905                                 else if(oligo[i] == 'Y' && (seq[i] != 'C' && seq[i] != 'T'))                                    {       countDiffs++;   }
906                                 else if(oligo[i] == 'M' && (seq[i] != 'C' && seq[i] != 'A'))                                    {       countDiffs++;   }
907                                 else if(oligo[i] == 'K' && (seq[i] != 'T' && seq[i] != 'G'))                                    {       countDiffs++;   }
908                                 else if(oligo[i] == 'W' && (seq[i] != 'T' && seq[i] != 'A'))                                    {       countDiffs++;   }
909                                 else if(oligo[i] == 'S' && (seq[i] != 'C' && seq[i] != 'G'))                                    {       countDiffs++;   }
910                                 else if(oligo[i] == 'B' && (seq[i] != 'C' && seq[i] != 'T' && seq[i] != 'G'))   {       countDiffs++;   }
911                                 else if(oligo[i] == 'D' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'G'))   {       countDiffs++;   }
912                                 else if(oligo[i] == 'H' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'C'))   {       countDiffs++;   }
913                                 else if(oligo[i] == 'V' && (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G'))   {       countDiffs++;   }       
914                         }
915                         
916                 }
917                 
918                 return countDiffs;
919         }
920         catch(exception& e) {
921                 m->errorOut(e, "TrimFlowsCommand", "countDiffs");
922                 exit(1);
923         }
924         
925 }
926
927 /**************************************************************************************************/
928
929 vector<unsigned long int> TrimFlowsCommand::getFlowFileBreaks() {
930
931         try{
932                         
933                 vector<unsigned long int> filePos;
934                 filePos.push_back(0);
935                                         
936                 FILE * pFile;
937                 unsigned long int size;
938                 
939                 //get num bytes in file
940                 pFile = fopen (flowFileName.c_str(),"rb");
941                 if (pFile==NULL) perror ("Error opening file");
942                 else{
943                         fseek (pFile, 0, SEEK_END);
944                         size=ftell (pFile);
945                         fclose (pFile);
946                 }
947                                 
948                 //estimate file breaks
949                 unsigned long int chunkSize = 0;
950                 chunkSize = size / processors;
951
952                 //file too small to divide by processors
953                 if (chunkSize == 0)  {  processors = 1; filePos.push_back(size); return filePos;        }
954                 
955                 //for each process seekg to closest file break and search for next '>' char. make that the filebreak
956                 for (int i = 0; i < processors; i++) {
957                         unsigned long int spot = (i+1) * chunkSize;
958                         
959                         ifstream in;
960                         m->openInputFile(flowFileName, in);
961                         in.seekg(spot);
962                         
963                         unsigned long int newSpot = spot;
964                         string dummy = m->getline(in);
965                         
966                         
967                         //there was not another sequence before the end of the file
968                         unsigned long int sanityPos = in.tellg();
969                         
970 //                      if (sanityPos == -1) {  break;  }
971 //                      else {  filePos.push_back(newSpot);  }
972                         if (sanityPos == -1) {  break;  }
973                         else {  filePos.push_back(sanityPos);  }
974                         
975                         in.close();
976                 }
977                 
978                 //save end pos
979                 filePos.push_back(size);
980                 
981                 //sanity check filePos
982                 for (int i = 0; i < (filePos.size()-1); i++) {
983                         if (filePos[(i+1)] <= filePos[i]) {  filePos.erase(filePos.begin()+(i+1)); i--; }
984                 }
985
986                 ifstream in;
987                 m->openInputFile(flowFileName, in);
988                 in >> numFlows;
989                 m->gobble(in);
990                 unsigned long int spot = in.tellg();
991                 filePos[0] = spot;
992                 in.close();
993                 
994                 processors = (filePos.size() - 1);
995                 
996                 return filePos; 
997         }
998         catch(exception& e) {
999                 m->errorOut(e, "TrimSeqsCommand", "getFlowFileBreaks");
1000                 exit(1);
1001         }
1002 }
1003
1004 /**************************************************************************************************/
1005
1006 int TrimFlowsCommand::createProcessesCreateTrim(string flowFileName, string trimFlowFileName, string scrapFlowFileName, string fastaFileName, vector<vector<string> > barcodePrimerComboFileNames){
1007
1008         try {
1009 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
1010                 int process = 1;
1011                 int exitCommand = 1;
1012                 processIDS.clear();
1013                 
1014                 //loop through and create all the processes you want
1015                 while (process != processors) {
1016                         int pid = fork();
1017                         
1018                         if (pid > 0) {
1019                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
1020                                 process++;
1021                         }else if (pid == 0){
1022                                 
1023                                 vector<vector<string> > tempBarcodePrimerComboFileNames = barcodePrimerComboFileNames;
1024                                 for(int i=0;i<tempBarcodePrimerComboFileNames.size();i++){
1025                                         for(int j=0;j<tempBarcodePrimerComboFileNames[0].size();j++){
1026                                                 tempBarcodePrimerComboFileNames[i][j] += toString(getpid()) + ".temp";
1027                                                 ofstream temp;
1028                                                 m->openOutputFile(tempBarcodePrimerComboFileNames[i][j], temp);
1029                                                 temp.close();
1030                                                 
1031                                         }
1032                                 }
1033                                                 
1034                                 driverCreateTrim(flowFileName,
1035                                                                  (trimFlowFileName + toString(getpid()) + ".temp"),
1036                                                                  (scrapFlowFileName + toString(getpid()) + ".temp"),
1037                                                                  (fastaFileName + toString(getpid()) + ".temp"),
1038                                                                  tempBarcodePrimerComboFileNames, lines[process]);
1039
1040                                 exit(0);
1041                         }else { 
1042                                 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine(); 
1043                                 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
1044                                 exit(0);
1045                         }
1046                 }
1047                 
1048                 //parent do my part
1049                 ofstream temp;
1050                 m->openOutputFile(trimFlowFileName, temp);
1051                 temp.close();
1052
1053                 m->openOutputFile(scrapFlowFileName, temp);
1054                 temp.close();
1055                 
1056                 if(fasta){
1057                         m->openOutputFile(fastaFileName, temp);
1058                         temp.close();
1059                 }
1060                 
1061                 driverCreateTrim(flowFileName, trimFlowFileName, scrapFlowFileName, fastaFileName, barcodePrimerComboFileNames, lines[0]);
1062
1063                 //force parent to wait until all the processes are done
1064                 for (int i=0;i<processIDS.size();i++) { 
1065                         int temp = processIDS[i];
1066                         wait(&temp);
1067                 }
1068                 
1069                 //append files
1070                 m->mothurOutEndLine();
1071                 for(int i=0;i<processIDS.size();i++){
1072                         
1073                         m->mothurOut("Appending files from process " + toString(processIDS[i])); m->mothurOutEndLine();
1074                         
1075                         m->appendFiles((trimFlowFileName + toString(processIDS[i]) + ".temp"), trimFlowFileName);
1076                         remove((trimFlowFileName + toString(processIDS[i]) + ".temp").c_str());
1077 //                      m->mothurOut("\tDone with trim.flow file"); m->mothurOutEndLine();
1078
1079                         m->appendFiles((scrapFlowFileName + toString(processIDS[i]) + ".temp"), scrapFlowFileName);
1080                         remove((scrapFlowFileName + toString(processIDS[i]) + ".temp").c_str());
1081 //                      m->mothurOut("\tDone with scrap.flow file"); m->mothurOutEndLine();
1082
1083                         if(fasta){
1084                                 m->appendFiles((fastaFileName + toString(processIDS[i]) + ".temp"), fastaFileName);
1085                                 remove((fastaFileName + toString(processIDS[i]) + ".temp").c_str());
1086 //                              m->mothurOut("\tDone with flow.fasta file"); m->mothurOutEndLine();
1087                         }
1088                         if(allFiles){                                           
1089                                 for (int j = 0; j < barcodePrimerComboFileNames.size(); j++) {
1090                                         for (int k = 0; k < barcodePrimerComboFileNames[0].size(); k++) {
1091                                                 m->appendFiles((barcodePrimerComboFileNames[j][k] + toString(processIDS[i]) + ".temp"), barcodePrimerComboFileNames[j][k]);
1092                                                 remove((barcodePrimerComboFileNames[j][k] + toString(processIDS[i]) + ".temp").c_str());
1093                                         }
1094                                 }
1095                         }
1096                 }
1097                 
1098                 return exitCommand;
1099 #endif          
1100         }
1101         catch(exception& e) {
1102                 m->errorOut(e, "TrimFlowsCommand", "createProcessesCreateTrim");
1103                 exit(1);
1104         }
1105 }
1106
1107 //***************************************************************************************************************