]> git.donarmstrong.com Git - mothur.git/blob - trimflowscommand.cpp
addition of trim.flows
[mothur.git] / trimflowscommand.cpp
1 /*
2  *  trimflowscommand.cpp
3  *  Mothur
4  *
5  *  Created by Pat Schloss on 12/22/10.
6  *  Copyright 2010 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "trimflowscommand.h"
11 #include "needlemanoverlap.hpp"
12
13 //**********************************************************************************************************************
14
15 vector<string> TrimFlowsCommand::getValidParameters(){  
16         try {
17                 string Array[] =  {"flow", "totalflows", "minflows",
18                         "fasta", "minlength", "maxlength", "maxhomop", "signal", "noise"
19                         "oligos", "pdiffs", "bdiffs", "tdiffs", 
20                         "allfiles",
21
22                         
23 //                      "group",
24 //                      "processors",
25                         "outputdir","inputdir"
26                 
27                 };
28                 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
29                 return myArray;
30         }
31         catch(exception& e) {
32                 m->errorOut(e, "TrimFlowsCommand", "getValidParameters");
33                 exit(1);
34         }
35 }
36
37 //**********************************************************************************************************************
38
39 vector<string> TrimFlowsCommand::getRequiredParameters(){       
40         try {
41                 string Array[] =  {"flow"};
42                 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
43                 return myArray;
44         }
45         catch(exception& e) {
46                 m->errorOut(e, "TrimFlowsCommand", "getRequiredParameters");
47                 exit(1);
48         }
49 }
50
51 //**********************************************************************************************************************
52
53 vector<string> TrimFlowsCommand::getRequiredFiles(){    
54         try {
55                 vector<string> myArray;
56                 return myArray;
57         }
58         catch(exception& e) {
59                 m->errorOut(e, "TrimFlowsCommand", "getRequiredFiles");
60                 exit(1);
61         }
62 }
63
64 //**********************************************************************************************************************
65
66 TrimFlowsCommand::TrimFlowsCommand(){   
67         try {
68                 abort = true;
69                 //initialize outputTypes
70                 vector<string> tempOutNames;
71                 outputTypes["flow"] = tempOutNames;
72                 outputTypes["fasta"] = tempOutNames;
73         }
74         catch(exception& e) {
75                 m->errorOut(e, "TrimFlowsCommand", "TrimFlowsCommand");
76                 exit(1);
77         }
78 }
79
80 //***************************************************************************************************************
81
82 TrimFlowsCommand::~TrimFlowsCommand(){  /*      do nothing      */      }
83
84 //***************************************************************************************************************
85
86 void TrimFlowsCommand::help(){
87         try {
88                 m->mothurOut("The trim.flows command reads a flowgram file and creates .....\n");
89                 m->mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n");
90                 m->mothurOut("For more details please check out the wiki http://www.mothur.org/wiki/Trim.flows.\n\n");
91                 
92         }
93         catch(exception& e) {
94                 m->errorOut(e, "TrimFlowsCommand", "help");
95                 exit(1);
96         }
97 }
98
99 //**********************************************************************************************************************
100
101 TrimFlowsCommand::TrimFlowsCommand(string option)  {
102         try {
103                 
104                 abort = false;
105                 comboStarts = 0;
106                 
107                 //allow user to run help
108                 if(option == "help") { help(); abort = true; }
109                 
110                 else {
111                         //valid paramters for this command
112                         string AlignArray[] =  {"flow", "totalflows", "minflows",
113                                 "fasta", "minlength", "maxlength", "maxhomop", "signal", "noise",
114                                 "oligos", "pdiffs", "bdiffs", "tdiffs", 
115                                 "allfiles",
116                 
117                                 //                      "group",
118                                 //                      "processors",
119                                 "outputdir","inputdir"
120                                 
121                         };
122                         
123                         vector<string> myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string)));
124                         
125                         OptionParser parser(option);
126                         map<string,string> parameters = parser.getParameters();
127                         
128                         ValidParameters validParameter;
129                         map<string,string>::iterator it;
130                         
131                         //check to make sure all parameters are valid for command
132                         for (it = parameters.begin(); it != parameters.end(); it++) { 
133                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
134                         }
135                         
136                         //initialize outputTypes
137                         vector<string> tempOutNames;
138                         outputTypes["flow"] = tempOutNames;
139                         //              outputTypes["fasta"] = tempOutNames;
140                         //              outputTypes["group"] = tempOutNames;
141                         
142                         //if the user changes the input directory command factory will send this info to us in the output parameter 
143                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
144                         if (inputDir == "not found"){   inputDir = "";          }
145                         else {
146                                 string path;
147                                 it = parameters.find("flow");
148                                 //user has given a template file
149                                 if(it != parameters.end()){ 
150                                         path = m->hasPath(it->second);
151                                         //if the user has not given a path then, add inputdir. else leave path alone.
152                                         if (path == "") {       parameters["flow"] = inputDir + it->second;             }
153                                 }
154                                 
155 //                              it = parameters.find("oligos");
156 //                              //user has given a template file
157 //                              if(it != parameters.end()){ 
158 //                                      path = m->hasPath(it->second);
159 //                                      //if the user has not given a path then, add inputdir. else leave path alone.
160 //                                      if (path == "") {       parameters["oligos"] = inputDir + it->second;           }
161 //                              }
162                                 
163                                 
164 //                              it = parameters.find("group");
165 //                              //user has given a template file
166 //                              if(it != parameters.end()){ 
167 //                                      path = m->hasPath(it->second);
168 //                                      //if the user has not given a path then, add inputdir. else leave path alone.
169 //                                      if (path == "") {       parameters["group"] = inputDir + it->second;            }
170 //                              }
171                         }
172                         
173                         
174                         //check for required parameters
175                         flowFileName = validParameter.validFile(parameters, "flow", true);
176                         if (flowFileName == "not found") { m->mothurOut("flow is a required parameter for the trim.flows command."); m->mothurOutEndLine(); abort = true; }
177                         else if (flowFileName == "not open") { abort = true; }  
178                         
179                         //if the user changes the output directory command factory will send this info to us in the output parameter 
180                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  
181                                 outputDir = ""; 
182                                 outputDir += m->hasPath(flowFileName); //if user entered a file with a path then preserve it    
183                         }
184                         
185                         
186                         //check for optional parameter and set defaults
187                         // ...at some point should added some additional type checking...
188                         
189                         string temp;
190                         temp = validParameter.validFile(parameters, "minflows", false);         if (temp == "not found") { temp = "360"; }
191                         convert(temp, minFlows);  
192
193                         temp = validParameter.validFile(parameters, "totalflows", false);       if (temp == "not found") { temp = "720"; }
194                         convert(temp, totalFlows);  
195                         
196                         
197                         temp = validParameter.validFile(parameters, "oligos", true);
198                         if (temp == "not found")        {       oligoFileName = "";             }
199                         else if(temp == "not open")     {       abort = true;                   } 
200                         else                                            {       oligoFileName = temp;   }
201                         
202                         temp = validParameter.validFile(parameters, "fasta", false);            if (temp == "not found"){       fasta = 0;              }
203                         else if(m->isTrue(temp))        {       fasta = 1;      }
204                         
205                         temp = validParameter.validFile(parameters, "maxhomop", false);         if (temp == "not found"){       temp = "9";             }
206                         convert(temp, maxHomoP);  
207
208                         temp = validParameter.validFile(parameters, "signal", false);           if (temp == "not found"){       temp = "0.50";  }
209                         convert(temp, signal);  
210
211                         temp = validParameter.validFile(parameters, "noise", false);            if (temp == "not found"){       temp = "0.70";  }
212                         convert(temp, noise);  
213
214                         temp = validParameter.validFile(parameters, "minlength", false);        if (temp == "not found"){       temp = "0";             }
215                         convert(temp, minLength); 
216                         
217                         temp = validParameter.validFile(parameters, "maxlength", false);        if (temp == "not found"){       temp = "0";             }
218                         convert(temp, maxLength);
219                         
220                         temp = validParameter.validFile(parameters, "bdiffs", false);           if (temp == "not found"){       temp = "0";             }
221                         convert(temp, bdiffs);
222                         
223                         temp = validParameter.validFile(parameters, "pdiffs", false);           if (temp == "not found"){       temp = "0";             }
224                         convert(temp, pdiffs);
225                         
226                         temp = validParameter.validFile(parameters, "tdiffs", false);           if (temp == "not found"){ int tempTotal = pdiffs + bdiffs;  temp = toString(tempTotal); }
227                         convert(temp, tdiffs);
228                         if(tdiffs == 0){        tdiffs = bdiffs + pdiffs;       }
229                         
230                         temp = validParameter.validFile(parameters, "allfiles", false);         if (temp == "not found") { temp = "T";          }
231                         allFiles = m->isTrue(temp);
232                         
233 //                      temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found") { temp = "1"; }
234 //                      convert(temp, processors); 
235                         
236                         if(allFiles && oligoFileName == ""){
237                                 m->mothurOut("You selected allfiles, but didn't enter an oligos file.  Ignoring the allfiles request."); m->mothurOutEndLine();
238                                 allFiles = 0;
239                         }
240
241                 }
242                 
243         }
244         catch(exception& e) {
245                 m->errorOut(e, "TrimFlowsCommand", "TrimSeqsCommand");
246                 exit(1);
247         }
248 }
249
250 //***************************************************************************************************************
251
252 int TrimFlowsCommand::execute(){
253         try{
254                 
255                 if (abort == true) { return 0; }
256
257                 string trimFlowFileName = outputDir + m->getRootName(m->getSimpleName(flowFileName)) + "trim.flow";
258                 outputNames.push_back(trimFlowFileName); outputTypes["flow"].push_back(trimFlowFileName);
259                 
260                 string scrapFlowFileName = outputDir + m->getRootName(m->getSimpleName(flowFileName)) + "scrap.flow";
261                 outputNames.push_back(scrapFlowFileName); outputTypes["flow"].push_back(scrapFlowFileName);
262
263                 string fastaFileName = outputDir + m->getRootName(m->getSimpleName(flowFileName)) + "flow.fasta";
264                 if(fasta){
265                         outputNames.push_back(fastaFileName); outputTypes["fasta"].push_back(fastaFileName);
266                 }
267                 
268                 driverCreateTrim(flowFileName, trimFlowFileName, scrapFlowFileName, fastaFileName);
269                 
270                 return 0;                       
271         }
272         catch(exception& e) {
273                 m->errorOut(e, "TrimSeqsCommand", "execute");
274                 exit(1);
275         }
276 }
277
278 //***************************************************************************************************************
279
280 int TrimFlowsCommand::driverCreateTrim(string flowFileName, string trimFlowFileName, string scrapFlowFileName, string fastaFileName){
281         
282         try {
283                 
284                 ofstream trimFlowFile;
285                 m->openOutputFile(trimFlowFileName, trimFlowFile);
286                 trimFlowFile.setf(ios::fixed, ios::floatfield); trimFlowFile.setf(ios::showpoint);
287
288                 ofstream scrapFlowFile;
289                 m->openOutputFile(scrapFlowFileName, scrapFlowFile);
290                 scrapFlowFile.setf(ios::fixed, ios::floatfield); scrapFlowFile.setf(ios::showpoint);
291         
292                 ifstream flowFile;
293                 m->openInputFile(flowFileName, flowFile);
294                 
295                 ofstream fastaFile;
296                 if(fasta){      m->openOutputFile(fastaFileName, fastaFile);    }
297                 
298                 vector<vector<string> > barcodePrimerCombos;    
299                 if(oligoFileName != ""){        getOligos(barcodePrimerCombos); }
300                 
301 //              inFASTA.seekg(line->start);
302                 
303 //              bool done = false;
304                 int count = 0;
305                 
306                 while (flowFile) {
307                         
308                         int success = 1;
309                         int currentSeqDiffs = 0;
310                         
311                         string trashCode = "";
312
313                         FlowData currFlow(flowFile, signal, noise, maxHomoP);
314                         m->gobble(flowFile);
315                         currFlow.capFlows(totalFlows);  
316                         Sequence currSeq = currFlow.getSequence();
317                         
318                         
319                         if(!currFlow.hasMinFlows(minFlows)){            //screen to see if sequence is of a minimum number of flows
320                                 success = 0;
321                                 trashCode += 'l';
322                         }
323                         
324                         if(minLength > 0 || maxLength > 0){                     //screen to see if sequence is above and below a specific number of bases
325                                 int seqLength = currFlow.getSeqLength();
326                                 if(seqLength < minLength || seqLength > maxLength){
327                                         success = 0;
328                                         trashCode += 'l';
329                                 }
330                         }
331                         
332                         int primerIndex, barcodeIndex;
333                         if(barcodes.size() != 0){
334                                 success = stripBarcode(currSeq, barcodeIndex);
335                                 if(success > bdiffs)            {       trashCode += 'b';       }
336                                 else{ currentSeqDiffs += success;  }
337                         }
338                         
339                         if(numFPrimers != 0){
340                                 success = stripForward(currSeq, primerIndex);
341                                 if(success > pdiffs)            {       trashCode += 'f';       }
342                                 else{ currentSeqDiffs += success;  }
343                         }
344                         
345                         if (currentSeqDiffs > tdiffs)   {       trashCode += 't';   }
346                         
347                         if(numRPrimers != 0){
348                                 success = stripReverse(currSeq);
349                                 if(!success)                            {       trashCode += 'r';       }
350                         }
351                                                 
352                                 
353                         if(trashCode.length() == 0){
354                                 currFlow.printFlows(trimFlowFile);
355                                 
356                                 if(fasta){
357                                         currFlow.printFASTA(fastaFile);
358                                 }
359                                 
360                                 if(barcodes.size() != 0 || primers.size() != 0){
361                                         string fileName = outputDir + m->getRootName(m->getSimpleName(flowFileName)) + barcodePrimerCombos[barcodeIndex][primerIndex] + ".flow";
362                                         ofstream output;
363                                         m->openOutputFileAppend(fileName, output);
364                                         currFlow.printFlows(output);
365                                         output.close();
366                                 }
367                                 
368                         }
369                         else{
370                                 currFlow.printFlows(scrapFlowFile, trashCode);
371                         }
372                                 
373 //                              if(barcodes.size() != 0){
374 //                                      string thisGroup = groupVector[groupBar];
375 //                                      int indexToFastaFile = groupBar;
376 //                                      if (primers.size() != 0){
377 //                                              //does this primer have a group
378 //                                              if (groupVector[groupPrime] != "") {  
379 //                                                      thisGroup += "." + groupVector[groupPrime]; 
380 //                                                      indexToFastaFile = combos[thisGroup];
381 //                                              }
382 //                                      }
383 //                                      outGroups << currSeq.getName() << '\t' << thisGroup << endl;
384 //                                      if(allFiles){
385 //                                              ofstream outTemp;
386 //                                              m->openOutputFileAppend(fastaNames[indexToFastaFile], outTemp);
387 //                                              //currSeq.printSequence(*fastaFileNames[indexToFastaFile]);
388 //                                              currSeq.printSequence(outTemp);
389 //                                              outTemp.close();
390 //                                              
391 //                                              if(qFileName != ""){
392 //                                                      //currQual.printQScores(*qualFileNames[indexToFastaFile]);
393 //                                                      ofstream outTemp2;
394 //                                                      m->openOutputFileAppend(qualNames[indexToFastaFile], outTemp2);
395 //                                                      currQual.printQScores(outTemp2);
396 //                                                      outTemp2.close();                                                       
397 //                                              }
398 //                                      }
399 //                              }
400 //                      }
401 //                      else{
402 //                              currSeq.setName(currSeq.getName() + '|' + trashCode);
403 //                              currSeq.setUnaligned(origSeq);
404 //                              currSeq.setAligned(origSeq);
405 //                              currSeq.printSequence(scrapFASTA);
406 //                      }
407
408                         count++;
409                                                 
410                         //report progress
411                         if((count) % 1000 == 0){        m->mothurOut(toString(count)); m->mothurOutEndLine();           }
412                         
413                 }
414                 //report progress
415                 if((count) % 1000 != 0){        m->mothurOut(toString(count)); m->mothurOutEndLine();           }
416                 
417                 
418                 trimFlowFile.close();
419                 scrapFlowFile.close();
420                 flowFile.close();
421                 
422                 return count;
423         }
424         catch(exception& e) {
425                 m->errorOut(e, "TrimSeqsCommand", "driverCreateTrim");
426                 exit(1);
427         }
428 }
429
430 //***************************************************************************************************************
431
432 void TrimFlowsCommand::getOligos(vector<vector<string> >& outFlowFileNames){
433         try {
434                 ifstream oligosFile;
435                 m->openInputFile(oligoFileName, oligosFile);
436                 
437                 string type, oligo, group;
438
439                 int indexPrimer = 0;
440                 int indexBarcode = 0;
441                 
442                 while(!oligosFile.eof()){
443                 
444                         oligosFile >> type; m->gobble(oligosFile);      //get the first column value of the row - is it a comment or a feature we are interested in?
445
446                         if(type[0] == '#'){     //igore the line because there's a comment
447                                 while (!oligosFile.eof())       {       char c = oligosFile.get(); if (c == 10 || c == 13){     break;  }       } // get rest of line if there's any crap there
448                         }
449                         else{                           //there's a feature we're interested in
450
451                                 for(int i=0;i<type.length();i++){       type[i] = toupper(type[i]);  }                                  //make type case insensitive
452
453                                 oligosFile >> oligo;    //get the DNA sequence for the feature
454
455                                 for(int i=0;i<oligo.length();i++){      //make type case insensitive and change any U's to T's
456                                         oligo[i] = toupper(oligo[i]);
457                                         if(oligo[i] == 'U')     {       oligo[i] = 'T'; }
458                                 }
459
460                                 if(type == "FORWARD"){  //if the feature is a forward primer...
461                                         group = "";
462
463                                         while (!oligosFile.eof())       {       // get rest of line in case there is a primer name = will have the name of the primer
464                                                 char c = oligosFile.get(); 
465                                                 if (c == 10 || c == 13){        break;  }
466                                                 else if (c == 32 || c == 9){;} //space or tab
467                                                 else {  group += c;  }
468                                         } 
469
470                                         //have we seen this primer already?
471                                         map<string, int>::iterator itPrimer = primers.find(oligo);
472                                         if (itPrimer != primers.end()) { m->mothurOut("primer " + oligo + " is in your oligos file already."); m->mothurOutEndLine();  }
473
474                                         primers[oligo]=indexPrimer; indexPrimer++;
475                                         primerNameVector.push_back(group);
476
477                                 }
478                                 else if(type == "REVERSE"){
479                                         Sequence oligoRC("reverse", oligo);
480                                         oligoRC.reverseComplement();
481                                         revPrimer.push_back(oligoRC.getUnaligned());
482                                 }
483                                 else if(type == "BARCODE"){
484                                         oligosFile >> group;
485
486                                         //check for repeat barcodes
487                                         map<string, int>::iterator itBar = barcodes.find(oligo);
488                                         if (itBar != barcodes.end()) { m->mothurOut("barcode " + oligo + " is in your oligos file already."); m->mothurOutEndLine();  }
489
490                                         barcodes[oligo]=indexBarcode; indexBarcode++;
491                                         barcodeNameVector.push_back(group);
492                                 }
493                                 else{
494                                         m->mothurOut(type + " is not recognized as a valid type. Choices are forward, reverse, and barcode. Ignoring " + oligo + "."); m->mothurOutEndLine();  
495                                 }
496                         }
497
498                         m->gobble(oligosFile);
499                 }
500                 oligosFile.close();
501                 
502                 //add in potential combos       
503                 outFlowFileNames.resize(barcodeNameVector.size());
504                 for(int i=0;i<outFlowFileNames.size();i++){
505                         outFlowFileNames[i].assign(primerNameVector.size(), "");
506                 }
507                 
508                 if(allFiles){
509                         for(map<string, int>::iterator itBar = barcodes.begin();itBar != barcodes.end();itBar++){
510                                 for(map<string, int>::iterator itPrimer = primers.begin();itPrimer != primers.end(); itPrimer++){
511
512                                         string primerName = primerNameVector[itPrimer->second];
513                                         string barcodeName = barcodeNameVector[itBar->second];
514                                                                                 
515                                         string comboGroupName = "";
516                                         string fileName = "";
517                                         
518                                         if(primerName == ""){
519                                                 comboGroupName = barcodeNameVector[itBar->second];
520                                                 fileName = outputDir + m->getRootName(m->getSimpleName(flowFileName)) + comboGroupName + ".flow";
521                                         }
522                                         else{
523                                                 comboGroupName = barcodeNameVector[itBar->second] + "." + primerNameVector[itPrimer->second];
524                                                 fileName = outputDir + m->getRootName(m->getSimpleName(flowFileName)) + comboGroupName + ".flow";
525                                         }
526                                         
527                                         outputNames.push_back(fileName);
528                                         outputTypes["flow"].push_back(fileName);
529                                         outFlowFileNames[itBar->second][itPrimer->second] = comboGroupName;
530                                         
531                                         ofstream temp;
532                                         m->openOutputFile(fileName, temp);
533                                         temp.close();
534                                 }
535                         }
536                 }
537                 
538                 numFPrimers = primers.size();
539                 numRPrimers = revPrimer.size();
540                 
541         }
542         catch(exception& e) {
543                 m->errorOut(e, "TrimSeqsCommand", "getOligos");
544                 exit(1);
545         }
546 }
547
548 //***************************************************************************************************************
549
550 int TrimFlowsCommand::stripBarcode(Sequence& seq, int& group){
551         try {
552                 
553                 string rawSequence = seq.getUnaligned();
554                 int success = bdiffs + 1;       //guilty until proven innocent
555                 
556                 //can you find the barcode
557                 for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
558                         string oligo = it->first;
559                         if(rawSequence.length() < oligo.length()){      //let's just assume that the barcodes are the same length
560                                 success = bdiffs + 10;                                  //if the sequence is shorter than the barcode then bail out
561                                 break;  
562                         }
563                         
564                         if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
565                                 group = it->second;
566                                 seq.setUnaligned(rawSequence.substr(oligo.length()));
567                                 
568                                 success = 0;
569                                 break;
570                         }
571                 }
572                 
573                 //if you found the barcode or if you don't want to allow for diffs
574                 //              cout << success;
575                 if ((bdiffs == 0) || (success == 0)) { return success;  }
576                 
577                 else { //try aligning and see if you can find it
578                         //                      cout << endl;
579                         
580                         int maxLength = 0;
581                         
582                         Alignment* alignment;
583                         if (barcodes.size() > 0) {
584                                 map<string,int>::iterator it=barcodes.begin();
585                                 
586                                 for(it;it!=barcodes.end();it++){
587                                         if(it->first.length() > maxLength){
588                                                 maxLength = it->first.length();
589                                         }
590                                 }
591                                 alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+bdiffs+1));  
592                                 
593                         }else{ alignment = NULL; } 
594                         
595                         //can you find the barcode
596                         int minDiff = 1e6;
597                         int minCount = 1;
598                         int minGroup = -1;
599                         int minPos = 0;
600                         
601                         for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
602                                 string oligo = it->first;
603                                 //                              int length = oligo.length();
604                                 
605                                 if(rawSequence.length() < maxLength){   //let's just assume that the barcodes are the same length
606                                         success = bdiffs + 10;
607                                         break;
608                                 }
609                                 
610                                 //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
611                                 alignment->align(oligo, rawSequence.substr(0,oligo.length()+bdiffs));
612                                 oligo = alignment->getSeqAAln();
613                                 string temp = alignment->getSeqBAln();
614                                 
615                                 int alnLength = oligo.length();
616                                 
617                                 for(int i=oligo.length()-1;i>=0;i--){
618                                         if(oligo[i] != '-'){    alnLength = i+1;        break;  }
619                                 }
620                                 oligo = oligo.substr(0,alnLength);
621                                 temp = temp.substr(0,alnLength);
622                                 
623                                 int newStart=0;
624                                 int numDiff = countDiffs(oligo, temp);
625                                 
626                                 if(numDiff < minDiff){
627                                         minDiff = numDiff;
628                                         minCount = 1;
629                                         minGroup = it->second;
630                                         minPos = 0;
631                                         for(int i=0;i<alnLength;i++){
632                                                 if(temp[i] != '-'){
633                                                         minPos++;
634                                                 }
635                                         }
636                                 }
637                                 else if(numDiff == minDiff){
638                                         minCount++;
639                                 }
640                                 
641                         }
642                         
643                         if(minDiff > bdiffs)    {       success = minDiff;              }       //no good matches
644                         else if(minCount > 1)   {       success = bdiffs + 100; }       //can't tell the difference between multiple barcodes
645                         else{                                                                                                   //use the best match
646                                 group = minGroup;
647                                 seq.setUnaligned(rawSequence.substr(minPos));
648                                 success = minDiff;
649                         }
650                         
651                         if (alignment != NULL) {  delete alignment;  }
652                         
653                 }
654                 
655                 return success;
656                 
657         }
658         catch(exception& e) {
659                 m->errorOut(e, "TrimFlowsCommand", "stripBarcode");
660                 exit(1);
661         }
662         
663 }
664
665 //***************************************************************************************************************
666
667 int TrimFlowsCommand::stripForward(Sequence& seq, int& group){
668         try {
669                 
670                 string rawSequence = seq.getUnaligned();
671                 int success = pdiffs + 1;       //guilty until proven innocent
672                 
673                 //can you find the primer
674                 for(map<string,int>::iterator it=primers.begin();it!=primers.end();it++){
675                         string oligo = it->first;
676                         if(rawSequence.length() < oligo.length()){      //let's just assume that the primers are the same length
677                                 success = pdiffs + 10;                                  //if the sequence is shorter than the barcode then bail out
678                                 break;  
679                         }
680                         
681                         if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
682                                 group = it->second;
683                                 seq.setUnaligned(rawSequence.substr(oligo.length()));
684                                 success = 0;
685                                 break;
686                         }
687                 }
688                 
689                 //if you found the barcode or if you don't want to allow for diffs
690                 //              cout << success;
691                 if ((pdiffs == 0) || (success == 0)) { return success;  }
692                 
693                 else { //try aligning and see if you can find it
694                         //                      cout << endl;
695                         
696                         int maxLength = 0;
697                         
698                         Alignment* alignment;
699                         if (primers.size() > 0) {
700                                 map<string,int>::iterator it=primers.begin();
701                                 
702                                 for(it;it!=primers.end();it++){
703                                         if(it->first.length() > maxLength){
704                                                 maxLength = it->first.length();
705                                         }
706                                 }
707                                 alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+pdiffs+1));  
708                                 
709                         }else{ alignment = NULL; } 
710                         
711                         //can you find the barcode
712                         int minDiff = 1e6;
713                         int minCount = 1;
714                         int minGroup = -1;
715                         int minPos = 0;
716                         
717                         for(map<string,int>::iterator it=primers.begin();it!=primers.end();it++){
718                                 string oligo = it->first;
719                                 //                              int length = oligo.length();
720                                 
721                                 if(rawSequence.length() < maxLength){   
722                                         success = pdiffs + 100;
723                                         break;
724                                 }
725                                 
726                                 //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
727                                 alignment->align(oligo, rawSequence.substr(0,oligo.length()+pdiffs));
728                                 oligo = alignment->getSeqAAln();
729                                 string temp = alignment->getSeqBAln();
730                                 
731                                 int alnLength = oligo.length();
732                                 
733                                 for(int i=oligo.length()-1;i>=0;i--){
734                                         if(oligo[i] != '-'){    alnLength = i+1;        break;  }
735                                 }
736                                 oligo = oligo.substr(0,alnLength);
737                                 temp = temp.substr(0,alnLength);
738                                 
739                                 int newStart=0;
740                                 int numDiff = countDiffs(oligo, temp);
741                                 
742                                 //                              cout << oligo << '\t' << temp << '\t' << numDiff << endl;                               
743                                 
744                                 if(numDiff < minDiff){
745                                         minDiff = numDiff;
746                                         minCount = 1;
747                                         minGroup = it->second;
748                                         minPos = 0;
749                                         for(int i=0;i<alnLength;i++){
750                                                 if(temp[i] != '-'){
751                                                         minPos++;
752                                                 }
753                                         }
754                                 }
755                                 else if(numDiff == minDiff){
756                                         minCount++;
757                                 }
758                                 
759                         }
760                         
761                         if(minDiff > pdiffs)    {       success = minDiff;              }       //no good matches
762                         else if(minCount > 1)   {       success = pdiffs + 10;  }       //can't tell the difference between multiple primers
763                         else{                                                                                                   //use the best match
764                                 group = minGroup;
765                                 seq.setUnaligned(rawSequence.substr(minPos));
766                                 success = minDiff;
767                         }
768                         
769                         if (alignment != NULL) {  delete alignment;  }
770                         
771                 }
772                 
773                 return success;
774                 
775         }
776         catch(exception& e) {
777                 m->errorOut(e, "TrimFlowsCommand", "stripForward");
778                 exit(1);
779         }
780 }
781
782 //***************************************************************************************************************
783
784 bool TrimFlowsCommand::stripReverse(Sequence& seq){
785         try {
786
787                 string rawSequence = seq.getUnaligned();
788                 bool success = 0;       //guilty until proven innocent
789                 
790                 for(int i=0;i<numRPrimers;i++){
791                         string oligo = revPrimer[i];
792                         
793                         if(rawSequence.length() < oligo.length()){
794                                 success = 0;
795                                 break;
796                         }
797                         
798                         if(compareDNASeq(oligo, rawSequence.substr(rawSequence.length()-oligo.length(),oligo.length()))){
799                                 seq.setUnaligned(rawSequence.substr(0,rawSequence.length()-oligo.length()));
800                                 success = 1;
801                                 break;
802                         }
803                 }       
804
805                 return success;
806                 
807         }
808         catch(exception& e) {
809                 m->errorOut(e, "TrimFlowsCommand", "stripReverse");
810                 exit(1);
811         }
812 }
813
814
815 //***************************************************************************************************************
816
817 bool TrimFlowsCommand::compareDNASeq(string oligo, string seq){
818         try {
819                 bool success = 1;
820                 int length = oligo.length();
821                 
822                 for(int i=0;i<length;i++){
823                         
824                         if(oligo[i] != seq[i]){
825                                 if(oligo[i] == 'A' || oligo[i] == 'T' || oligo[i] == 'G' || oligo[i] == 'C')    {       success = 0;    }
826                                 else if((oligo[i] == 'N' || oligo[i] == 'I') && (seq[i] == 'N'))                                {       success = 0;    }
827                                 else if(oligo[i] == 'R' && (seq[i] != 'A' && seq[i] != 'G'))                                    {       success = 0;    }
828                                 else if(oligo[i] == 'Y' && (seq[i] != 'C' && seq[i] != 'T'))                                    {       success = 0;    }
829                                 else if(oligo[i] == 'M' && (seq[i] != 'C' && seq[i] != 'A'))                                    {       success = 0;    }
830                                 else if(oligo[i] == 'K' && (seq[i] != 'T' && seq[i] != 'G'))                                    {       success = 0;    }
831                                 else if(oligo[i] == 'W' && (seq[i] != 'T' && seq[i] != 'A'))                                    {       success = 0;    }
832                                 else if(oligo[i] == 'S' && (seq[i] != 'C' && seq[i] != 'G'))                                    {       success = 0;    }
833                                 else if(oligo[i] == 'B' && (seq[i] != 'C' && seq[i] != 'T' && seq[i] != 'G'))   {       success = 0;    }
834                                 else if(oligo[i] == 'D' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'G'))   {       success = 0;    }
835                                 else if(oligo[i] == 'H' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'C'))   {       success = 0;    }
836                                 else if(oligo[i] == 'V' && (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G'))   {       success = 0;    }                       
837                                 
838                                 if(success == 0)        {       break;   }
839                         }
840                         else{
841                                 success = 1;
842                         }
843                 }
844                 
845                 return success;
846         }
847         catch(exception& e) {
848                 m->errorOut(e, "TrimFlowsCommand", "compareDNASeq");
849                 exit(1);
850         }
851         
852 }
853
854 //***************************************************************************************************************
855
856 int TrimFlowsCommand::countDiffs(string oligo, string seq){
857         try {
858                 
859                 int length = oligo.length();
860                 int countDiffs = 0;
861                 
862                 for(int i=0;i<length;i++){
863                         
864                         if(oligo[i] != seq[i]){
865                                 if(oligo[i] == 'A' || oligo[i] == 'T' || oligo[i] == 'G' || oligo[i] == 'C' || oligo[i] == '-' || oligo[i] == '.')      {       countDiffs++;   }
866                                 else if((oligo[i] == 'N' || oligo[i] == 'I') && (seq[i] == 'N'))                                {       countDiffs++;   }
867                                 else if(oligo[i] == 'R' && (seq[i] != 'A' && seq[i] != 'G'))                                    {       countDiffs++;   }
868                                 else if(oligo[i] == 'Y' && (seq[i] != 'C' && seq[i] != 'T'))                                    {       countDiffs++;   }
869                                 else if(oligo[i] == 'M' && (seq[i] != 'C' && seq[i] != 'A'))                                    {       countDiffs++;   }
870                                 else if(oligo[i] == 'K' && (seq[i] != 'T' && seq[i] != 'G'))                                    {       countDiffs++;   }
871                                 else if(oligo[i] == 'W' && (seq[i] != 'T' && seq[i] != 'A'))                                    {       countDiffs++;   }
872                                 else if(oligo[i] == 'S' && (seq[i] != 'C' && seq[i] != 'G'))                                    {       countDiffs++;   }
873                                 else if(oligo[i] == 'B' && (seq[i] != 'C' && seq[i] != 'T' && seq[i] != 'G'))   {       countDiffs++;   }
874                                 else if(oligo[i] == 'D' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'G'))   {       countDiffs++;   }
875                                 else if(oligo[i] == 'H' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'C'))   {       countDiffs++;   }
876                                 else if(oligo[i] == 'V' && (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G'))   {       countDiffs++;   }       
877                         }
878                         
879                 }
880                 
881                 return countDiffs;
882         }
883         catch(exception& e) {
884                 m->errorOut(e, "TrimFlowsCommand", "countDiffs");
885                 exit(1);
886         }
887         
888 }
889
890 //***************************************************************************************************************
891
892
893
894
895
896
897
898
899
900
901
902
903