]> git.donarmstrong.com Git - mothur.git/blob - trimseqscommand.cpp
fixed bug in unifrac commands with unrooted trees
[mothur.git] / trimseqscommand.cpp
1 /*
2  *  trimseqscommand.cpp
3  *  Mothur
4  *
5  *  Created by Pat Schloss on 6/6/09.
6  *  Copyright 2009 Patrick D. Schloss. All rights reserved.
7  *
8  */
9
10 #include "trimseqscommand.h"
11 #include "needlemanoverlap.hpp"
12
13 //**********************************************************************************************************************
14
15 vector<string> TrimSeqsCommand::getValidParameters(){   
16         try {
17                 string Array[] =  {"fasta", "flip", "oligos", "maxambig", "maxhomop","minlength", "maxlength", "qfile", 
18                                                                         "qthreshold", "qwindowaverage", "qstepsize", "qwindowsize", "qaverage", "rollaverage",
19                                                                         "keepfirst", "removelast",
20                                                                         "allfiles", "qtrim","tdiffs", "pdiffs", "bdiffs", "processors", "outputdir","inputdir"};
21                 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
22                 return myArray;
23         }
24         catch(exception& e) {
25                 m->errorOut(e, "TrimSeqsCommand", "getValidParameters");
26                 exit(1);
27         }
28 }
29
30 //**********************************************************************************************************************
31
32 TrimSeqsCommand::TrimSeqsCommand(){     
33         try {
34                 abort = true; calledHelp = true; 
35                 vector<string> tempOutNames;
36                 outputTypes["fasta"] = tempOutNames;
37                 outputTypes["qual"] = tempOutNames;
38         }
39         catch(exception& e) {
40                 m->errorOut(e, "TrimSeqsCommand", "TrimSeqsCommand");
41                 exit(1);
42         }
43 }
44
45 //**********************************************************************************************************************
46
47 vector<string> TrimSeqsCommand::getRequiredParameters(){        
48         try {
49                 string Array[] =  {"fasta"};
50                 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
51                 return myArray;
52         }
53         catch(exception& e) {
54                 m->errorOut(e, "TrimSeqsCommand", "getRequiredParameters");
55                 exit(1);
56         }
57 }
58
59 //**********************************************************************************************************************
60
61 vector<string> TrimSeqsCommand::getRequiredFiles(){     
62         try {
63                 vector<string> myArray;
64                 return myArray;
65         }
66         catch(exception& e) {
67                 m->errorOut(e, "TrimSeqsCommand", "getRequiredFiles");
68                 exit(1);
69         }
70 }
71
72 //***************************************************************************************************************
73
74 TrimSeqsCommand::TrimSeqsCommand(string option)  {
75         try {
76                 
77                 abort = false; calledHelp = false;   
78                 comboStarts = 0;
79                 
80                 //allow user to run help
81                 if(option == "help") { help(); abort = true; calledHelp = true; }
82                 
83                 else {
84                         //valid paramters for this command
85                         string AlignArray[] =  {        "fasta", "flip", "oligos", "maxambig", "maxhomop", "minlength", "maxlength", "qfile", 
86                                                                 "qthreshold", "qwindowaverage", "qstepsize", "qwindowsize", "qaverage", "rollaverage",
87                                                                 "keepfirst", "removelast",
88                                                                 "allfiles", "qtrim","tdiffs", "pdiffs", "bdiffs", "processors", "outputdir","inputdir"};
89                         
90                         vector<string> myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string)));
91                         
92                         OptionParser parser(option);
93                         map<string,string> parameters = parser.getParameters();
94                         
95                         ValidParameters validParameter;
96                         map<string,string>::iterator it;
97                         
98                         //check to make sure all parameters are valid for command
99                         for (it = parameters.begin(); it != parameters.end(); it++) { 
100                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
101                         }
102                         
103                         //initialize outputTypes
104                         vector<string> tempOutNames;
105                         outputTypes["fasta"] = tempOutNames;
106                         outputTypes["qual"] = tempOutNames;
107                         
108                         //if the user changes the input directory command factory will send this info to us in the output parameter 
109                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
110                         if (inputDir == "not found"){   inputDir = "";          }
111                         else {
112                                 string path;
113                                 it = parameters.find("fasta");
114                                 //user has given a template file
115                                 if(it != parameters.end()){ 
116                                         path = m->hasPath(it->second);
117                                         //if the user has not given a path then, add inputdir. else leave path alone.
118                                         if (path == "") {       parameters["fasta"] = inputDir + it->second;            }
119                                 }
120                                 
121                                 it = parameters.find("oligos");
122                                 //user has given a template file
123                                 if(it != parameters.end()){ 
124                                         path = m->hasPath(it->second);
125                                         //if the user has not given a path then, add inputdir. else leave path alone.
126                                         if (path == "") {       parameters["oligos"] = inputDir + it->second;           }
127                                 }
128                                 
129                                 it = parameters.find("qfile");
130                                 //user has given a template file
131                                 if(it != parameters.end()){ 
132                                         path = m->hasPath(it->second);
133                                         //if the user has not given a path then, add inputdir. else leave path alone.
134                                         if (path == "") {       parameters["qfile"] = inputDir + it->second;            }
135                                 }
136                                 
137                         }
138
139                         
140                         //check for required parameters
141                         fastaFile = validParameter.validFile(parameters, "fasta", true);
142                         if (fastaFile == "not found") { m->mothurOut("fasta is a required parameter for the trim.seqs command."); m->mothurOutEndLine(); abort = true; }
143                         else if (fastaFile == "not open") { abort = true; }     
144                         
145                         //if the user changes the output directory command factory will send this info to us in the output parameter 
146                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  
147                                 outputDir = ""; 
148                                 outputDir += m->hasPath(fastaFile); //if user entered a file with a path then preserve it       
149                         }
150                 
151                         
152                         //check for optional parameter and set defaults
153                         // ...at some point should added some additional type checking...
154                         string temp;
155                         temp = validParameter.validFile(parameters, "flip", false);
156                         if (temp == "not found"){       flip = 0;       }
157                         else if(m->isTrue(temp))        {       flip = 1;       }
158                 
159                         temp = validParameter.validFile(parameters, "oligos", true);
160                         if (temp == "not found"){       oligoFile = "";         }
161                         else if(temp == "not open"){    abort = true;   } 
162                         else                                    {       oligoFile = temp;               }
163                         
164                         
165                         temp = validParameter.validFile(parameters, "maxambig", false);         if (temp == "not found") { temp = "-1"; }
166                         convert(temp, maxAmbig);  
167
168                         temp = validParameter.validFile(parameters, "maxhomop", false);         if (temp == "not found") { temp = "0"; }
169                         convert(temp, maxHomoP);  
170
171                         temp = validParameter.validFile(parameters, "minlength", false);        if (temp == "not found") { temp = "0"; }
172                         convert(temp, minLength); 
173                         
174                         temp = validParameter.validFile(parameters, "maxlength", false);        if (temp == "not found") { temp = "0"; }
175                         convert(temp, maxLength);
176                         
177                         temp = validParameter.validFile(parameters, "bdiffs", false);           if (temp == "not found") { temp = "0"; }
178                         convert(temp, bdiffs);
179                         
180                         temp = validParameter.validFile(parameters, "pdiffs", false);           if (temp == "not found") { temp = "0"; }
181                         convert(temp, pdiffs);
182                         
183                         temp = validParameter.validFile(parameters, "tdiffs", false);           if (temp == "not found") { int tempTotal = pdiffs + bdiffs;  temp = toString(tempTotal); }
184                         convert(temp, tdiffs);
185                         
186                         if(tdiffs == 0){        tdiffs = bdiffs + pdiffs;       }
187                         
188                         temp = validParameter.validFile(parameters, "qfile", true);     
189                         if (temp == "not found")        {       qFileName = "";         }
190                         else if(temp == "not open")     {       abort = true;           }
191                         else                                            {       qFileName = temp;       }
192                         
193                         temp = validParameter.validFile(parameters, "qthreshold", false);       if (temp == "not found") { temp = "0"; }
194                         convert(temp, qThreshold);
195                         
196                         temp = validParameter.validFile(parameters, "qtrim", false);            if (temp == "not found") { temp = "t"; }
197                         qtrim = m->isTrue(temp);
198
199                         temp = validParameter.validFile(parameters, "rollaverage", false);      if (temp == "not found") { temp = "0"; }
200                         convert(temp, qRollAverage);
201
202                         temp = validParameter.validFile(parameters, "qwindowaverage", false);if (temp == "not found") { temp = "0"; }
203                         convert(temp, qWindowAverage);
204
205                         temp = validParameter.validFile(parameters, "qwindowsize", false);      if (temp == "not found") { temp = "50"; }
206                         convert(temp, qWindowSize);
207
208                         temp = validParameter.validFile(parameters, "qstepsize", false);        if (temp == "not found") { temp = "1"; }
209                         convert(temp, qWindowStep);
210
211                         temp = validParameter.validFile(parameters, "qaverage", false);         if (temp == "not found") { temp = "0"; }
212                         convert(temp, qAverage);
213
214                         temp = validParameter.validFile(parameters, "keepfirst", false);        if (temp == "not found") { temp = "0"; }
215                         convert(temp, keepFirst);
216
217                         temp = validParameter.validFile(parameters, "removelast", false);       if (temp == "not found") { temp = "0"; }
218                         convert(temp, removeLast);
219                         
220                         temp = validParameter.validFile(parameters, "allfiles", false);         if (temp == "not found") { temp = "F"; }
221                         allFiles = m->isTrue(temp);
222                         
223                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found") { temp = "1"; }
224                         convert(temp, processors); 
225                         
226                         if ((oligoFile != "") && (groupfile != "")) {
227                                 m->mothurOut("You given both a oligos file and a groupfile, only one is allowed."); m->mothurOutEndLine(); abort = true;
228                         }
229                                                                                                 
230                         
231                         if(allFiles && (oligoFile == "") && (groupfile == "")){
232                                 m->mothurOut("You selected allfiles, but didn't enter an oligos or group file.  Ignoring the allfiles request."); m->mothurOutEndLine();
233                         }
234                         if((qAverage != 0 && qThreshold != 0) && qFileName == ""){
235                                 m->mothurOut("You didn't provide a quality file name, quality criteria will be ignored."); m->mothurOutEndLine();
236                                 qAverage=0;
237                                 qThreshold=0;
238                         }
239                         if(!flip && oligoFile=="" && !maxLength && !minLength && (maxAmbig==-1) && !maxHomoP && qFileName == ""){               
240                                 m->mothurOut("You didn't set any options... quiting command."); m->mothurOutEndLine();
241                                 abort = true;
242                         }
243                 }
244
245         }
246         catch(exception& e) {
247                 m->errorOut(e, "TrimSeqsCommand", "TrimSeqsCommand");
248                 exit(1);
249         }
250 }
251
252 //**********************************************************************************************************************
253
254 void TrimSeqsCommand::help(){
255         try {
256                 m->mothurOut("The trim.seqs command reads a fastaFile and creates 2 new fasta files, .trim.fasta and scrap.fasta, as well as group files if you provide and oligos file.\n");
257                 m->mothurOut("The .trim.fasta contains sequences that meet your requirements, and the .scrap.fasta contains those which don't.\n");
258                 m->mothurOut("The trim.seqs command parameters are fasta, flip, oligos, maxambig, maxhomop, minlength, maxlength, qfile, qthreshold, qaverage, diffs, qtrim, keepfirst, removelast and allfiles.\n");
259                 m->mothurOut("The fasta parameter is required.\n");
260                 m->mothurOut("The flip parameter will output the reverse compliment of your trimmed sequence. The default is false.\n");
261                 m->mothurOut("The oligos parameter allows you to provide an oligos file.\n");
262                 m->mothurOut("The maxambig parameter allows you to set the maximum number of ambigious bases allowed. The default is -1.\n");
263                 m->mothurOut("The maxhomop parameter allows you to set a maximum homopolymer length. \n");
264                 m->mothurOut("The minlength parameter allows you to set and minimum sequence length. \n");
265                 m->mothurOut("The maxlength parameter allows you to set and maximum sequence length. \n");
266                 m->mothurOut("The tdiffs parameter is used to specify the total number of differences allowed in the sequence. The default is pdiffs + bdiffs.\n");
267                 m->mothurOut("The bdiffs parameter is used to specify the number of differences allowed in the barcode. The default is 0.\n");
268                 m->mothurOut("The pdiffs parameter is used to specify the number of differences allowed in the primer. The default is 0.\n");
269                 m->mothurOut("The qfile parameter allows you to provide a quality file.\n");
270                 m->mothurOut("The qthreshold parameter allows you to set a minimum quality score allowed. \n");
271                 m->mothurOut("The qaverage parameter allows you to set a minimum average quality score allowed. \n");
272                 m->mothurOut("The qwindowsize parameter allows you to set a number of bases in a window. Default=50.\n");
273                 m->mothurOut("The qwindowaverage parameter allows you to set a minimum average quality score allowed over a window. \n");
274                 m->mothurOut("The rollaverage parameter allows you to set a minimum rolling average quality score allowed over a window. \n");
275                 m->mothurOut("The qstepsize parameter allows you to set a number of bases to move the window over. Default=1.\n");
276                 m->mothurOut("The allfiles parameter will create separate group and fasta file for each grouping. The default is F.\n");
277                 m->mothurOut("The qtrim parameter will trim sequence from the point that they fall below the qthreshold and put it in the .trim file if set to true. The default is T.\n");
278                 m->mothurOut("The keepfirst parameter trims the sequence to the first keepfirst number of bases after the barcode or primers are removed, before the sequence is checked to see if it meets the other requirements. \n");
279                 m->mothurOut("The removelast removes the last removelast number of bases after the barcode or primers are removed, before the sequence is checked to see if it meets the other requirements.\n");
280                 m->mothurOut("The trim.seqs command should be in the following format: \n");
281                 m->mothurOut("trim.seqs(fasta=yourFastaFile, flip=yourFlip, oligos=yourOligos, maxambig=yourMaxambig,  \n");
282                 m->mothurOut("maxhomop=yourMaxhomop, minlength=youMinlength, maxlength=yourMaxlength)  \n");    
283                 m->mothurOut("Example trim.seqs(fasta=abrecovery.fasta, flip=..., oligos=..., maxambig=..., maxhomop=..., minlength=..., maxlength=...).\n");
284                 m->mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n");
285                 m->mothurOut("For more details please check out the wiki http://www.mothur.org/wiki/Trim.seqs .\n\n");
286
287         }
288         catch(exception& e) {
289                 m->errorOut(e, "TrimSeqsCommand", "help");
290                 exit(1);
291         }
292 }
293
294
295 //***************************************************************************************************************
296
297 TrimSeqsCommand::~TrimSeqsCommand(){    /*      do nothing      */      }
298
299 //***************************************************************************************************************
300
301 int TrimSeqsCommand::execute(){
302         try{
303         
304                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
305                 
306                 numFPrimers = 0;  //this needs to be initialized
307                 numRPrimers = 0;
308                 vector<vector<string> > fastaFileNames;
309                 vector<vector<string> > qualFileNames;
310                 
311                 string trimSeqFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "trim.fasta";
312                 outputNames.push_back(trimSeqFile); outputTypes["fasta"].push_back(trimSeqFile);
313                 
314                 string scrapSeqFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "scrap.fasta";
315                 outputNames.push_back(scrapSeqFile); outputTypes["fasta"].push_back(scrapSeqFile);
316                 
317                 string trimQualFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "trim.qual";
318                 string scrapQualFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "scrap.qual";
319                 if (qFileName != "") {
320                         outputNames.push_back(trimQualFile);
321                         outputNames.push_back(scrapQualFile);
322                         outputTypes["qual"].push_back(trimQualFile);
323                         outputTypes["qual"].push_back(scrapQualFile); 
324                 }
325                 
326                 string outputGroupFileName;
327
328                 if(oligoFile != ""){
329                         outputGroupFileName = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "groups";
330                         outputNames.push_back(outputGroupFileName); outputTypes["groups"].push_back(outputGroupFileName);
331                         getOligos(fastaFileNames, qualFileNames);
332                 }
333
334                 vector<unsigned long int> fastaFilePos;
335                 vector<unsigned long int> qFilePos;
336                 
337                 setLines(fastaFile, qFileName, fastaFilePos, qFilePos);
338                 
339                 for (int i = 0; i < (fastaFilePos.size()-1); i++) {
340                         lines.push_back(new linePair(fastaFilePos[i], fastaFilePos[(i+1)]));
341                         if (qFileName != "") {  qLines.push_back(new linePair(qFilePos[i], qFilePos[(i+1)]));  }
342                 }       
343                 if(qFileName == "")     {       qLines = lines; } //files with duds
344                 
345                 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
346                                 if(processors == 1){
347                                         driverCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, outputGroupFileName, fastaFileNames, qualFileNames, lines[0], qLines[0]);
348                                 }else{
349                                         createProcessesCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, outputGroupFileName, fastaFileNames, qualFileNames); 
350                                 }       
351                 #else
352                                 driverCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, outputGroupFileName, fastaFileNames, qualFileNames, lines[0], qLines[0]);
353                 #endif
354                 
355                 if (m->control_pressed) {  return 0; }                  
356                         
357                                 
358                 if(allFiles){
359                         for(int i=0;i<fastaFileNames.size();i++){
360                                 for(int j=0;j<fastaFileNames[0].size();j++){
361                                         if(m->isBlank(fastaFileNames[i][j])){
362                                                 remove(fastaFileNames[i][j].c_str());
363
364                                                 if(qFileName != ""){
365                                                         remove(fastaFileNames[i][j].c_str());
366                                                 }
367
368                                         }
369                                 }
370                         }
371                 }
372                 
373                 
374                 
375                 if (m->control_pressed) { 
376                         for (int i = 0; i < outputNames.size(); i++) {  remove(outputNames[i].c_str()); }
377                         return 0;
378                 }
379
380                 m->mothurOutEndLine();
381                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
382                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
383                 m->mothurOutEndLine();
384                 
385                 return 0;       
386                         
387         }
388         catch(exception& e) {
389                 m->errorOut(e, "TrimSeqsCommand", "execute");
390                 exit(1);
391         }
392 }
393                 
394 /**************************************************************************************/
395
396 int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string trimFileName, string scrapFileName, string trimQFileName, string scrapQFileName, string groupFileName, vector<vector<string> > fastaFileNames, vector<vector<string> > qualFileNames, linePair* line, linePair* qline) {        
397                 
398         try {
399                 
400                 ofstream trimFASTAFile;
401                 m->openOutputFile(trimFileName, trimFASTAFile);
402                 
403                 ofstream scrapFASTAFile;
404                 m->openOutputFile(scrapFileName, scrapFASTAFile);
405                 
406                 ofstream trimQualFile;
407                 ofstream scrapQualFile;
408                 if(qFileName != ""){
409                         m->openOutputFile(trimQFileName, trimQualFile);
410                         m->openOutputFile(scrapQFileName, scrapQualFile);
411                 }
412                 
413                 ofstream outGroupsFile;
414                 if (oligoFile != ""){   m->openOutputFile(groupFileName, outGroupsFile);   }
415                 
416                 if(allFiles){
417                         for (int i = 0; i < fastaFileNames.size(); i++) { //clears old file
418                                 for (int j = 0; j < fastaFileNames[i].size(); j++) { //clears old file
419                                         ofstream temp;
420                                         m->openOutputFile(fastaFileNames[i][j], temp);                  temp.close();
421                                         if(qFileName != ""){
422                                                 m->openOutputFile(qualFileNames[i][j], temp);                   temp.close();
423                                         }
424                                 }
425                         }
426                 }
427                 
428                 ifstream inFASTA;
429                 m->openInputFile(filename, inFASTA);
430                 inFASTA.seekg(line->start);
431                 
432                 ifstream qFile;
433                 if(qFileName != "")     {
434                         m->openInputFile(qFileName, qFile);
435                         qFile.seekg(qline->start);  
436                 }
437                 
438                 int count = 0;
439                 bool moreSeqs = 1;
440         
441                 while (moreSeqs) {
442                                 
443                         if (m->control_pressed) { 
444                                 inFASTA.close(); trimFASTAFile.close(); scrapFASTAFile.close();
445                                 if (oligoFile != "") {   outGroupsFile.close();   }
446
447                                 if(qFileName != ""){
448                                         qFile.close();
449                                 }
450                                 for (int i = 0; i < outputNames.size(); i++) {  remove(outputNames[i].c_str()); }
451
452                                 return 0;
453                         }
454                         
455                         int success = 1;
456                         string trashCode = "";
457                         int currentSeqsDiffs = 0;
458
459                         Sequence currSeq(inFASTA); m->gobble(inFASTA);
460
461                         QualityScores currQual;
462                         if(qFileName != ""){
463                                 currQual = QualityScores(qFile);  m->gobble(qFile);
464                         }
465
466                         string origSeq = currSeq.getUnaligned();
467                         if (origSeq != "") {
468                                 
469                                 int barcodeIndex = 0;
470                                 int primerIndex = 0;
471                                 
472                                 if(barcodes.size() != 0){
473                                         success = stripBarcode(currSeq, currQual, barcodeIndex);
474                                         if(success > bdiffs)            {       trashCode += 'b';       }
475                                         else{ currentSeqsDiffs += success;  }
476                                 }
477                                 
478                                 if(numFPrimers != 0){
479                                         success = stripForward(currSeq, currQual, primerIndex);
480                                         if(success > pdiffs)            {       trashCode += 'f';       }
481                                         else{ currentSeqsDiffs += success;  }
482                                 }
483                                 
484                                 if (currentSeqsDiffs > tdiffs)  {       trashCode += 't';   }
485                                 
486                                 if(numRPrimers != 0){
487                                         success = stripReverse(currSeq, currQual);
488                                         if(!success)                            {       trashCode += 'r';       }
489                                 }
490
491                                 if(keepFirst != 0){
492                                         success = keepFirstTrim(currSeq, currQual);
493                                 }
494                                 
495                                 if(removeLast != 0){
496                                         success = removeLastTrim(currSeq, currQual);
497                                         if(!success)                            {       trashCode += 'l';       }
498                                 }
499
500                                 
501                                 if(qFileName != ""){
502                                         int origLength = currSeq.getNumBases();
503                                         
504                                         if(qThreshold != 0)                     {       success = currQual.stripQualThreshold(currSeq, qThreshold);                     }
505                                         else if(qAverage != 0)          {       success = currQual.cullQualAverage(currSeq, qAverage);                          }
506                                         else if(qRollAverage != 0)      {       success = currQual.stripQualRollingAverage(currSeq, qRollAverage);      }
507                                         else if(qWindowAverage != 0){   success = currQual.stripQualWindowAverage(currSeq, qWindowStep, qWindowSize, qWindowAverage);   }
508                                         else                                            {       success = 1;                            }
509                                         
510                                         //you don't want to trim, if it fails above then scrap it
511                                         if ((!qtrim) && (origLength != currSeq.getNumBases())) {  success = 0; }
512                                         
513                                         if(!success)                            {       trashCode += 'q';       }
514                                 }                               
515                 
516                                 if(minLength > 0 || maxLength > 0){
517                                         success = cullLength(currSeq);
518                                         if(!success)                            {       trashCode += 'l';       }
519                                 }
520                                 if(maxHomoP > 0){
521                                         success = cullHomoP(currSeq);
522                                         if(!success)                            {       trashCode += 'h';       }
523                                 }
524                                 if(maxAmbig != -1){
525                                         success = cullAmbigs(currSeq);
526                                         if(!success)                            {       trashCode += 'n';       }
527                                 }
528                                 
529                                 if(flip){               // should go last                       
530                                         currSeq.reverseComplement();
531                                         if(qFileName != ""){
532                                                 currQual.flipQScores(); 
533                                         }
534                                 }
535                                 
536                                 if(trashCode.length() == 0){
537                                         currSeq.setAligned(currSeq.getUnaligned());
538                                         currSeq.printSequence(trimFASTAFile);
539                                         
540                                         if(qFileName != ""){
541                                                 currQual.printQScores(trimQualFile);
542                                         }
543                                         
544                                         if(barcodes.size() != 0){
545                                                 outGroupsFile << currSeq.getName() << '\t' << barcodeNameVector[barcodeIndex] << endl;
546                                         }
547                                         
548                                         
549                                         if(allFiles){
550                                                 ofstream output;
551                                                 m->openOutputFileAppend(fastaFileNames[barcodeIndex][primerIndex], output);
552                                                 currSeq.printSequence(output);
553                                                 output.close();
554                                                 
555                                                 if(qFileName != ""){
556                                                         m->openOutputFileAppend(qualFileNames[barcodeIndex][primerIndex], output);
557                                                         currQual.printQScores(output);
558                                                         output.close();                                                 
559                                                 }
560                                         }
561                                 }
562                                 else{
563                                         currSeq.setName(currSeq.getName() + '|' + trashCode);
564                                         currSeq.setUnaligned(origSeq);
565                                         currSeq.setAligned(origSeq);
566                                         currSeq.printSequence(scrapFASTAFile);
567                                         if(qFileName != ""){
568                                                 currQual.printQScores(scrapQualFile);
569                                         }
570                                 }
571                                 count++;
572                         }
573                         
574                         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
575                                 unsigned long int pos = inFASTA.tellg();
576                                 if ((pos == -1) || (pos >= line->end)) { break; }
577                         #else
578                                 if (inFASTA.eof()) { break; }
579                         #endif
580                                 
581                         //report progress
582                         if((count) % 1000 == 0){        m->mothurOut(toString(count)); m->mothurOutEndLine();           }
583                         
584                 }
585                 //report progress
586                 if((count) % 1000 != 0){        m->mothurOut(toString(count)); m->mothurOutEndLine();           }
587
588                 
589                 inFASTA.close();
590                 trimFASTAFile.close();
591                 scrapFASTAFile.close();
592                 if (oligoFile != "") {   outGroupsFile.close();   }
593                 if(qFileName != "")     {       qFile.close();  scrapQualFile.close(); trimQualFile.close();    }
594                 
595                 return count;
596         }
597         catch(exception& e) {
598                 m->errorOut(e, "TrimSeqsCommand", "driverCreateTrim");
599                 exit(1);
600         }
601 }
602
603 /**************************************************************************************************/
604
605 int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName, string trimFASTAFileName, string scrapFASTAFileName, string trimQualFileName, string scrapQualFileName, string groupFile, vector<vector<string> > fastaFileNames, vector<vector<string> > qualFileNames) {
606         try {
607 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
608                 int process = 1;
609                 int exitCommand = 1;
610                 processIDS.clear();
611                 
612                 //loop through and create all the processes you want
613                 while (process != processors) {
614                         int pid = fork();
615                         
616                         if (pid > 0) {
617                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
618                                 process++;
619                         }else if (pid == 0){
620                                 
621                                 vector<vector<string> > tempFASTAFileNames = fastaFileNames;
622                                 vector<vector<string> > tempPrimerQualFileNames = qualFileNames;
623
624                                 if(allFiles){
625                                         ofstream temp;
626
627                                         for(int i=0;i<tempFASTAFileNames.size();i++){
628                                                 for(int j=0;j<tempFASTAFileNames[i].size();j++){
629                                                         tempFASTAFileNames[i][j] += toString(getpid()) + ".temp";
630                                                         m->openOutputFile(tempFASTAFileNames[i][j], temp);                      temp.close();
631
632                                                         if(qFileName != ""){
633                                                                 tempPrimerQualFileNames[i][j] += toString(getpid()) + ".temp";
634                                                                 m->openOutputFile(tempPrimerQualFileNames[i][j], temp);         temp.close();
635                                                         }
636                                                 }
637                                         }
638                                 }
639                                                         
640                                 driverCreateTrim(filename,
641                                                                  qFileName,
642                                                                  (trimFASTAFileName + toString(getpid()) + ".temp"),
643                                                                  (scrapFASTAFileName + toString(getpid()) + ".temp"),
644                                                                  (trimQualFileName + toString(getpid()) + ".temp"),
645                                                                  (scrapQualFileName + toString(getpid()) + ".temp"),
646                                                                  (groupFile + toString(getpid()) + ".temp"),
647                                                                  tempFASTAFileNames,
648                                                                  tempPrimerQualFileNames,
649                                                                  lines[process],
650                                                                  qLines[process]);
651                                 
652                                 exit(0);
653                         }else { 
654                                 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine(); 
655                                 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
656                                 exit(0);
657                         }
658                 }
659                 
660                 //parent do my part
661                 ofstream temp;
662                 m->openOutputFile(trimFASTAFileName, temp);             temp.close();
663                 m->openOutputFile(scrapFASTAFileName, temp);    temp.close();
664                 m->openOutputFile(trimQualFileName, temp);              temp.close();
665                 m->openOutputFile(scrapQualFileName, temp);             temp.close();
666
667                 
668                 
669                 driverCreateTrim(filename, qFileName, trimFASTAFileName, scrapFASTAFileName, trimQualFileName, scrapQualFileName, groupFile, fastaFileNames, qualFileNames, lines[0], qLines[0]);
670                 
671                 
672                 //force parent to wait until all the processes are done
673                 for (int i=0;i<processIDS.size();i++) { 
674                         int temp = processIDS[i];
675                         wait(&temp);
676                 }
677                 
678                 //append files
679                 for(int i=0;i<processIDS.size();i++){
680                         
681                         m->mothurOut("Appending files from process " + toString(processIDS[i])); m->mothurOutEndLine();
682                         
683                         m->appendFiles((trimFASTAFileName + toString(processIDS[i]) + ".temp"), trimFASTAFileName);
684                         remove((trimFASTAFileName + toString(processIDS[i]) + ".temp").c_str());
685                         m->appendFiles((scrapFASTAFileName + toString(processIDS[i]) + ".temp"), scrapFASTAFileName);
686                         remove((scrapFASTAFileName + toString(processIDS[i]) + ".temp").c_str());
687                         
688                         if(qFileName != ""){
689                                 m->appendFiles((trimQualFileName + toString(processIDS[i]) + ".temp"), trimQualFileName);
690                                 remove((trimQualFileName + toString(processIDS[i]) + ".temp").c_str());
691                                 m->appendFiles((scrapQualFileName + toString(processIDS[i]) + ".temp"), scrapQualFileName);
692                                 remove((scrapQualFileName + toString(processIDS[i]) + ".temp").c_str());
693                         }
694                         
695                         m->appendFiles((groupFile + toString(processIDS[i]) + ".temp"), groupFile);
696                         remove((groupFile + toString(processIDS[i]) + ".temp").c_str());
697                         
698                         
699                         if(allFiles){
700                                 for(int j=0;j<fastaFileNames.size();j++){
701                                         for(int k=0;k<fastaFileNames[j].size();k++){
702                                                 m->appendFiles((fastaFileNames[j][k] + toString(processIDS[i]) + ".temp"), fastaFileNames[j][k]);
703                                                 remove((fastaFileNames[j][k] + toString(processIDS[i]) + ".temp").c_str());
704                                                 
705                                                 if(qFileName != ""){
706                                                         m->appendFiles((qualFileNames[j][k] + toString(processIDS[i]) + ".temp"), qualFileNames[j][k]);
707                                                         remove((qualFileNames[j][k] + toString(processIDS[i]) + ".temp").c_str());
708                                                 }
709                                         }
710                                 }
711                         }
712                         
713                 }
714         
715                 return exitCommand;
716 #endif          
717         }
718         catch(exception& e) {
719                 m->errorOut(e, "TrimSeqsCommand", "createProcessesCreateTrim");
720                 exit(1);
721         }
722 }
723
724 /**************************************************************************************************/
725
726 int TrimSeqsCommand::setLines(string filename, string qfilename, vector<unsigned long int>& fastaFilePos, vector<unsigned long int>& qfileFilePos) {
727         try {
728                 
729                 //set file positions for fasta file
730                 fastaFilePos = m->divideFile(filename, processors);
731                 
732                 if (qfilename == "") { return processors; }
733                 
734                 //get name of first sequence in each chunk
735                 map<string, int> firstSeqNames;
736                 for (int i = 0; i < (fastaFilePos.size()-1); i++) {
737                         ifstream in;
738                         m->openInputFile(filename, in);
739                         in.seekg(fastaFilePos[i]);
740                 
741                         Sequence temp(in); 
742                         firstSeqNames[temp.getName()] = i;
743                 
744                         in.close();
745                 }
746                                 
747                 //seach for filePos of each first name in the qfile and save in qfileFilePos
748                 ifstream inQual;
749                 m->openInputFile(qfilename, inQual);
750                 
751                 string input;
752                 while(!inQual.eof()){   
753                         input = m->getline(inQual);
754
755                         if (input.length() != 0) {
756                                 if(input[0] == '>'){ //this is a sequence name line
757                                         istringstream nameStream(input);
758                                         
759                                         string sname = "";  nameStream >> sname;
760                                         sname = sname.substr(1);
761                                         
762                                         map<string, int>::iterator it = firstSeqNames.find(sname);
763                                         
764                                         if(it != firstSeqNames.end()) { //this is the start of a new chunk
765                                                 unsigned long int pos = inQual.tellg(); 
766                                                 qfileFilePos.push_back(pos - input.length() - 1);       
767                                                 firstSeqNames.erase(it);
768                                         }
769                                 }
770                         }
771                         
772                         if (firstSeqNames.size() == 0) { break; }
773                 }
774                 inQual.close();
775                 
776                 
777                 if (firstSeqNames.size() != 0) { 
778                         for (map<string, int>::iterator it = firstSeqNames.begin(); it != firstSeqNames.end(); it++) {
779                                 m->mothurOut(it->first + " is in your fasta file and not in your quality file, not using quality file."); m->mothurOutEndLine();
780                         }
781                         qFileName = "";
782                         return processors;
783                 }
784
785                 //get last file position of qfile
786                 FILE * pFile;
787                 unsigned long int size;
788                 
789                 //get num bytes in file
790                 pFile = fopen (qfilename.c_str(),"rb");
791                 if (pFile==NULL) perror ("Error opening file");
792                 else{
793                         fseek (pFile, 0, SEEK_END);
794                         size=ftell (pFile);
795                         fclose (pFile);
796                 }
797                 
798                 qfileFilePos.push_back(size);
799                 
800                 return processors;
801         }
802         catch(exception& e) {
803                 m->errorOut(e, "TrimSeqsCommand", "setLines");
804                 exit(1);
805         }
806 }
807
808 //***************************************************************************************************************
809
810 void TrimSeqsCommand::getOligos(vector<vector<string> >& fastaFileNames, vector<vector<string> >& qualFileNames){
811         try {
812                 ifstream inOligos;
813                 m->openInputFile(oligoFile, inOligos);
814                 
815                 ofstream test;
816                 
817                 string type, oligo, group;
818
819                 int indexPrimer = 0;
820                 int indexBarcode = 0;
821                 
822                 while(!inOligos.eof()){
823
824                         inOligos >> type; m->gobble(inOligos);
825                                         
826                         if(type[0] == '#'){
827                                 while (!inOligos.eof()) {       char c = inOligos.get(); if (c == 10 || c == 13){       break;  }       } // get rest of line if there's any crap there
828                         }
829                         else{
830                                 //make type case insensitive
831                                 for(int i=0;i<type.length();i++){       type[i] = toupper(type[i]);  }
832                                 
833                                 inOligos >> oligo;
834                                 
835                                 for(int i=0;i<oligo.length();i++){
836                                         oligo[i] = toupper(oligo[i]);
837                                         if(oligo[i] == 'U')     {       oligo[i] = 'T'; }
838                                 }
839                                 
840                                 if(type == "FORWARD"){
841                                         group = "";
842                                         
843                                         // get rest of line in case there is a primer name
844                                         while (!inOligos.eof()) {       
845                                                 char c = inOligos.get(); 
846                                                 if (c == 10 || c == 13){        break;  }
847                                                 else if (c == 32 || c == 9){;} //space or tab
848                                                 else {  group += c;  }
849                                         } 
850                                         
851                                         //check for repeat barcodes
852                                         map<string, int>::iterator itPrime = primers.find(oligo);
853                                         if (itPrime != primers.end()) { m->mothurOut("primer " + oligo + " is in your oligos file already."); m->mothurOutEndLine();  }
854                                         
855                                         primers[oligo]=indexPrimer; indexPrimer++;              
856                                         primerNameVector.push_back(group);
857                                 }
858                                 else if(type == "REVERSE"){
859                                         Sequence oligoRC("reverse", oligo);
860                                         oligoRC.reverseComplement();
861                                         revPrimer.push_back(oligoRC.getUnaligned());
862                                 }
863                                 else if(type == "BARCODE"){
864                                         inOligos >> group;
865                                         
866                                         //check for repeat barcodes
867                                         map<string, int>::iterator itBar = barcodes.find(oligo);
868                                         if (itBar != barcodes.end()) { m->mothurOut("barcode " + oligo + " is in your oligos file already."); m->mothurOutEndLine();  }
869                                                 
870                                         barcodes[oligo]=indexBarcode; indexBarcode++;
871                                         barcodeNameVector.push_back(group);
872                                 }
873                                 else{   m->mothurOut(type + " is not recognized as a valid type. Choices are forward, reverse, and barcode. Ignoring " + oligo + "."); m->mothurOutEndLine();  }
874                         }
875                         m->gobble(inOligos);
876                 }       
877                 inOligos.close();
878                 
879                 if(barcodeNameVector.size() == 0 && primerNameVector[0] == ""){ allFiles = 0;   }
880                 
881                 //add in potential combos
882                 if(barcodeNameVector.size() == 0){
883                         barcodes[""] = 0;
884                         barcodeNameVector.push_back("");                        
885                 }
886                 
887                 if(primerNameVector.size() == 0){
888                         primers[""] = 0;
889                         primerNameVector.push_back("");                 
890                 }
891                 
892                 fastaFileNames.resize(barcodeNameVector.size());
893                 for(int i=0;i<fastaFileNames.size();i++){
894                         fastaFileNames[i].assign(primerNameVector.size(), "");
895                 }
896                 if(qFileName != ""){    qualFileNames = fastaFileNames; }
897                 
898                 if(allFiles){
899                         for(map<string, int>::iterator itBar = barcodes.begin();itBar != barcodes.end();itBar++){
900                                 for(map<string, int>::iterator itPrimer = primers.begin();itPrimer != primers.end(); itPrimer++){
901                                         
902                                         string primerName = primerNameVector[itPrimer->second];
903                                         string barcodeName = barcodeNameVector[itBar->second];
904                                         
905                                         string comboGroupName = "";
906                                         string fastaFileName = "";
907                                         string qualFileName = "";
908                                         
909                                         if(primerName == ""){
910                                                 comboGroupName = barcodeNameVector[itBar->second];
911                                         }
912                                         else{
913                                                 if(barcodeName == ""){
914                                                         comboGroupName = primerNameVector[itPrimer->second];
915                                                 }
916                                                 else{
917                                                         comboGroupName = barcodeNameVector[itBar->second] + "." + primerNameVector[itPrimer->second];
918                                                 }
919                                         }
920
921                                         ofstream temp;
922                                         fastaFileName = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + comboGroupName + ".fasta";
923                                         outputNames.push_back(fastaFileName);
924                                         outputTypes["fasta"].push_back(fastaFileName);
925                                         fastaFileNames[itBar->second][itPrimer->second] = fastaFileName;
926                                         m->openOutputFile(fastaFileName, temp);         temp.close();
927
928                                         if(qFileName != ""){
929                                                 qualFileName = outputDir + m->getRootName(m->getSimpleName(qFileName)) + comboGroupName + ".qual";
930                                                 outputNames.push_back(qualFileName);
931                                                 outputTypes["qual"].push_back(qualFileName);
932                                                 qualFileNames[itBar->second][itPrimer->second] = qualFileName;
933                                                 m->openOutputFile(qualFileName, temp);          temp.close();
934                                         }
935                                 }
936                         }
937                 }
938                 numFPrimers = primers.size();
939                 numRPrimers = revPrimer.size();
940
941         }
942         catch(exception& e) {
943                 m->errorOut(e, "TrimSeqsCommand", "getOligos");
944                 exit(1);
945         }
946 }
947
948 //***************************************************************************************************************
949
950 int TrimSeqsCommand::stripBarcode(Sequence& seq, QualityScores& qual, int& group){
951         try {
952                 
953                 string rawSequence = seq.getUnaligned();
954                 int success = bdiffs + 1;       //guilty until proven innocent
955                 
956                 //can you find the barcode
957                 for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
958                         string oligo = it->first;
959                         if(rawSequence.length() < oligo.length()){      //let's just assume that the barcodes are the same length
960                                 success = bdiffs + 10;                                  //if the sequence is shorter than the barcode then bail out
961                                 break;  
962                         }
963                         
964                         if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
965                                 group = it->second;
966                                 seq.setUnaligned(rawSequence.substr(oligo.length()));
967                                 
968                                 if(qual.getName() != ""){
969                                         qual.trimQScores(oligo.length(), -1);
970                                 }
971                                 
972                                 success = 0;
973                                 break;
974                         }
975                 }
976                 
977                 //if you found the barcode or if you don't want to allow for diffs
978                 if ((bdiffs == 0) || (success == 0)) { return success;  }
979                 
980                 else { //try aligning and see if you can find it
981
982                         int maxLength = 0;
983
984                         Alignment* alignment;
985                         if (barcodes.size() > 0) {
986                                 map<string,int>::iterator it=barcodes.begin();
987
988                                 for(it;it!=barcodes.end();it++){
989                                         if(it->first.length() > maxLength){
990                                                 maxLength = it->first.length();
991                                         }
992                                 }
993                                 alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+bdiffs+1));  
994
995                         }else{ alignment = NULL; } 
996                         
997                         //can you find the barcode
998                         int minDiff = 1e6;
999                         int minCount = 1;
1000                         int minGroup = -1;
1001                         int minPos = 0;
1002                         
1003                         for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
1004                                 string oligo = it->first;
1005 //                              int length = oligo.length();
1006                                 
1007                                 if(rawSequence.length() < maxLength){   //let's just assume that the barcodes are the same length
1008                                         success = bdiffs + 10;
1009                                         break;
1010                                 }
1011                                 
1012                                 //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
1013                                 alignment->align(oligo, rawSequence.substr(0,oligo.length()+bdiffs));
1014                                 oligo = alignment->getSeqAAln();
1015                                 string temp = alignment->getSeqBAln();
1016                 
1017                                 int alnLength = oligo.length();
1018                                 
1019                                 for(int i=oligo.length()-1;i>=0;i--){
1020                                         if(oligo[i] != '-'){    alnLength = i+1;        break;  }
1021                                 }
1022                                 oligo = oligo.substr(0,alnLength);
1023                                 temp = temp.substr(0,alnLength);
1024                                 
1025                                 int numDiff = countDiffs(oligo, temp);
1026                                 
1027                                 if(numDiff < minDiff){
1028                                         minDiff = numDiff;
1029                                         minCount = 1;
1030                                         minGroup = it->second;
1031                                         minPos = 0;
1032                                         for(int i=0;i<alnLength;i++){
1033                                                 if(temp[i] != '-'){
1034                                                         minPos++;
1035                                                 }
1036                                         }
1037                                 }
1038                                 else if(numDiff == minDiff){
1039                                         minCount++;
1040                                 }
1041
1042                         }
1043
1044                         if(minDiff > bdiffs)    {       success = minDiff;              }       //no good matches
1045                         else if(minCount > 1)   {       success = bdiffs + 100; }       //can't tell the difference between multiple barcodes
1046                         else{                                                                                                   //use the best match
1047                                 group = minGroup;
1048                                 seq.setUnaligned(rawSequence.substr(minPos));
1049                                 
1050                                 if(qual.getName() != ""){
1051                                         qual.trimQScores(minPos, -1);
1052                                 }
1053                                 success = minDiff;
1054                         }
1055                         
1056                         if (alignment != NULL) {  delete alignment;  }
1057                         
1058                 }
1059                 
1060                 return success;
1061                 
1062         }
1063         catch(exception& e) {
1064                 m->errorOut(e, "TrimSeqsCommand", "stripBarcode");
1065                 exit(1);
1066         }
1067
1068 }
1069
1070 //***************************************************************************************************************
1071
1072 int TrimSeqsCommand::stripForward(Sequence& seq, QualityScores& qual, int& group){
1073         try {
1074                 string rawSequence = seq.getUnaligned();
1075                 int success = pdiffs + 1;       //guilty until proven innocent
1076                 
1077                 //can you find the primer
1078                 for(map<string,int>::iterator it=primers.begin();it!=primers.end();it++){
1079                         string oligo = it->first;
1080                         if(rawSequence.length() < oligo.length()){      //let's just assume that the primers are the same length
1081                                 success = pdiffs + 10;                                  //if the sequence is shorter than the barcode then bail out
1082                                 break;  
1083                         }
1084                         
1085                         if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
1086                                 group = it->second;
1087                                 seq.setUnaligned(rawSequence.substr(oligo.length()));
1088                                 if(qual.getName() != ""){
1089                                         qual.trimQScores(oligo.length(), -1);
1090                                 }
1091                                 success = 0;
1092                                 break;
1093                         }
1094                 }
1095
1096                 //if you found the barcode or if you don't want to allow for diffs
1097                 if ((pdiffs == 0) || (success == 0)) { return success;  }
1098                 
1099                 else { //try aligning and see if you can find it
1100
1101                         int maxLength = 0;
1102
1103                         Alignment* alignment;
1104                         if (primers.size() > 0) {
1105                                 map<string,int>::iterator it=primers.begin();
1106
1107                                 for(it;it!=primers.end();it++){
1108                                         if(it->first.length() > maxLength){
1109                                                 maxLength = it->first.length();
1110                                         }
1111                                 }
1112                                 alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+pdiffs+1));  
1113
1114                         }else{ alignment = NULL; } 
1115                         
1116                         //can you find the barcode
1117                         int minDiff = 1e6;
1118                         int minCount = 1;
1119                         int minGroup = -1;
1120                         int minPos = 0;
1121                         
1122                         for(map<string,int>::iterator it=primers.begin();it!=primers.end();it++){
1123                                 string oligo = it->first;
1124 //                              int length = oligo.length();
1125                                 
1126                                 if(rawSequence.length() < maxLength){   
1127                                         success = pdiffs + 100;
1128                                         break;
1129                                 }
1130                                 
1131                                 //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
1132                                 alignment->align(oligo, rawSequence.substr(0,oligo.length()+pdiffs));
1133                                 oligo = alignment->getSeqAAln();
1134                                 string temp = alignment->getSeqBAln();
1135                 
1136                                 int alnLength = oligo.length();
1137                                 
1138                                 for(int i=oligo.length()-1;i>=0;i--){
1139                                         if(oligo[i] != '-'){    alnLength = i+1;        break;  }
1140                                 }
1141                                 oligo = oligo.substr(0,alnLength);
1142                                 temp = temp.substr(0,alnLength);
1143                                 
1144                                 int numDiff = countDiffs(oligo, temp);
1145                                 
1146                                 if(numDiff < minDiff){
1147                                         minDiff = numDiff;
1148                                         minCount = 1;
1149                                         minGroup = it->second;
1150                                         minPos = 0;
1151                                         for(int i=0;i<alnLength;i++){
1152                                                 if(temp[i] != '-'){
1153                                                         minPos++;
1154                                                 }
1155                                         }
1156                                 }
1157                                 else if(numDiff == minDiff){
1158                                         minCount++;
1159                                 }
1160
1161                         }
1162
1163                         if(minDiff > pdiffs)    {       success = minDiff;              }       //no good matches
1164                         else if(minCount > 1)   {       success = pdiffs + 10;  }       //can't tell the difference between multiple primers
1165                         else{                                                                                                   //use the best match
1166                                 group = minGroup;
1167                                 seq.setUnaligned(rawSequence.substr(minPos));
1168                                 if(qual.getName() != ""){
1169                                         qual.trimQScores(minPos, -1);
1170                                 }
1171                                 success = minDiff;
1172                         }
1173                         
1174                         if (alignment != NULL) {  delete alignment;  }
1175                         
1176                 }
1177                 
1178                 return success;
1179
1180         }
1181         catch(exception& e) {
1182                 m->errorOut(e, "TrimSeqsCommand", "stripForward");
1183                 exit(1);
1184         }
1185 }
1186
1187 //***************************************************************************************************************
1188
1189 bool TrimSeqsCommand::stripReverse(Sequence& seq, QualityScores& qual){
1190         try {
1191                 string rawSequence = seq.getUnaligned();
1192                 bool success = 0;       //guilty until proven innocent
1193                 
1194                 for(int i=0;i<numRPrimers;i++){
1195                         string oligo = revPrimer[i];
1196                         
1197                         if(rawSequence.length() < oligo.length()){
1198                                 success = 0;
1199                                 break;
1200                         }
1201                         
1202                         if(compareDNASeq(oligo, rawSequence.substr(rawSequence.length()-oligo.length(),oligo.length()))){
1203                                 seq.setUnaligned(rawSequence.substr(0,rawSequence.length()-oligo.length()));
1204                                 if(qual.getName() != ""){
1205                                         qual.trimQScores(-1, rawSequence.length()-oligo.length());
1206                                 }
1207                                 success = 1;
1208                                 break;
1209                         }
1210                 }       
1211                 return success;
1212                 
1213         }
1214         catch(exception& e) {
1215                 m->errorOut(e, "TrimSeqsCommand", "stripReverse");
1216                 exit(1);
1217         }
1218 }
1219
1220 //***************************************************************************************************************
1221
1222 bool TrimSeqsCommand::keepFirstTrim(Sequence& sequence, QualityScores& qscores){
1223         try {
1224                 bool success = 1;
1225                 if(qscores.getName() != ""){
1226                         qscores.trimQScores(-1, keepFirst);
1227                 }
1228                 sequence.trim(keepFirst);
1229                 return success;
1230         }
1231         catch(exception& e) {
1232                 m->errorOut(e, "keepFirstTrim", "countDiffs");
1233                 exit(1);
1234         }
1235         
1236 }       
1237
1238 //***************************************************************************************************************
1239
1240 bool TrimSeqsCommand::removeLastTrim(Sequence& sequence, QualityScores& qscores){
1241         try {
1242                 bool success = 0;
1243                 
1244                 int length = sequence.getNumBases() - removeLast;
1245                 
1246                 if(length > 0){
1247                         if(qscores.getName() != ""){
1248                                 qscores.trimQScores(-1, length);
1249                         }
1250                         sequence.trim(length);
1251                         success = 1;
1252                 }
1253                 else{
1254                         success = 0;
1255                 }
1256
1257                 return success;
1258         }
1259         catch(exception& e) {
1260                 m->errorOut(e, "removeLastTrim", "countDiffs");
1261                 exit(1);
1262         }
1263         
1264 }       
1265
1266 //***************************************************************************************************************
1267
1268 bool TrimSeqsCommand::cullLength(Sequence& seq){
1269         try {
1270         
1271                 int length = seq.getNumBases();
1272                 bool success = 0;       //guilty until proven innocent
1273                 
1274                 if(length >= minLength && maxLength == 0)                       {       success = 1;    }
1275                 else if(length >= minLength && length <= maxLength)     {       success = 1;    }
1276                 else                                                                                            {       success = 0;    }
1277                 
1278                 return success;
1279         
1280         }
1281         catch(exception& e) {
1282                 m->errorOut(e, "TrimSeqsCommand", "cullLength");
1283                 exit(1);
1284         }
1285         
1286 }
1287
1288 //***************************************************************************************************************
1289
1290 bool TrimSeqsCommand::cullHomoP(Sequence& seq){
1291         try {
1292                 int longHomoP = seq.getLongHomoPolymer();
1293                 bool success = 0;       //guilty until proven innocent
1294                 
1295                 if(longHomoP <= maxHomoP){      success = 1;    }
1296                 else                                    {       success = 0;    }
1297                 
1298                 return success;
1299         }
1300         catch(exception& e) {
1301                 m->errorOut(e, "TrimSeqsCommand", "cullHomoP");
1302                 exit(1);
1303         }
1304         
1305 }
1306
1307 //***************************************************************************************************************
1308
1309 bool TrimSeqsCommand::cullAmbigs(Sequence& seq){
1310         try {
1311                 int numNs = seq.getAmbigBases();
1312                 bool success = 0;       //guilty until proven innocent
1313                 
1314                 if(numNs <= maxAmbig)   {       success = 1;    }
1315                 else                                    {       success = 0;    }
1316                 
1317                 return success;
1318         }
1319         catch(exception& e) {
1320                 m->errorOut(e, "TrimSeqsCommand", "cullAmbigs");
1321                 exit(1);
1322         }
1323         
1324 }
1325
1326 //***************************************************************************************************************
1327
1328 bool TrimSeqsCommand::compareDNASeq(string oligo, string seq){
1329         try {
1330                 bool success = 1;
1331                 int length = oligo.length();
1332                 
1333                 for(int i=0;i<length;i++){
1334                         
1335                         if(oligo[i] != seq[i]){
1336                                 if(oligo[i] == 'A' || oligo[i] == 'T' || oligo[i] == 'G' || oligo[i] == 'C')    {       success = 0;    }
1337                                 else if((oligo[i] == 'N' || oligo[i] == 'I') && (seq[i] == 'N'))                                {       success = 0;    }
1338                                 else if(oligo[i] == 'R' && (seq[i] != 'A' && seq[i] != 'G'))                                    {       success = 0;    }
1339                                 else if(oligo[i] == 'Y' && (seq[i] != 'C' && seq[i] != 'T'))                                    {       success = 0;    }
1340                                 else if(oligo[i] == 'M' && (seq[i] != 'C' && seq[i] != 'A'))                                    {       success = 0;    }
1341                                 else if(oligo[i] == 'K' && (seq[i] != 'T' && seq[i] != 'G'))                                    {       success = 0;    }
1342                                 else if(oligo[i] == 'W' && (seq[i] != 'T' && seq[i] != 'A'))                                    {       success = 0;    }
1343                                 else if(oligo[i] == 'S' && (seq[i] != 'C' && seq[i] != 'G'))                                    {       success = 0;    }
1344                                 else if(oligo[i] == 'B' && (seq[i] != 'C' && seq[i] != 'T' && seq[i] != 'G'))   {       success = 0;    }
1345                                 else if(oligo[i] == 'D' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'G'))   {       success = 0;    }
1346                                 else if(oligo[i] == 'H' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'C'))   {       success = 0;    }
1347                                 else if(oligo[i] == 'V' && (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G'))   {       success = 0;    }                       
1348                                 
1349                                 if(success == 0)        {       break;   }
1350                         }
1351                         else{
1352                                 success = 1;
1353                         }
1354                 }
1355                 
1356                 return success;
1357         }
1358         catch(exception& e) {
1359                 m->errorOut(e, "TrimSeqsCommand", "compareDNASeq");
1360                 exit(1);
1361         }
1362
1363 }
1364
1365 //***************************************************************************************************************
1366
1367 int TrimSeqsCommand::countDiffs(string oligo, string seq){
1368         try {
1369
1370                 int length = oligo.length();
1371                 int countDiffs = 0;
1372                 
1373                 for(int i=0;i<length;i++){
1374                                                                 
1375                         if(oligo[i] != seq[i]){
1376                                 if(oligo[i] == 'A' || oligo[i] == 'T' || oligo[i] == 'G' || oligo[i] == 'C' || oligo[i] == '-' || oligo[i] == '.')      {       countDiffs++;   }
1377                                 else if((oligo[i] == 'N' || oligo[i] == 'I') && (seq[i] == 'N'))                                {       countDiffs++;   }
1378                                 else if(oligo[i] == 'R' && (seq[i] != 'A' && seq[i] != 'G'))                                    {       countDiffs++;   }
1379                                 else if(oligo[i] == 'Y' && (seq[i] != 'C' && seq[i] != 'T'))                                    {       countDiffs++;   }
1380                                 else if(oligo[i] == 'M' && (seq[i] != 'C' && seq[i] != 'A'))                                    {       countDiffs++;   }
1381                                 else if(oligo[i] == 'K' && (seq[i] != 'T' && seq[i] != 'G'))                                    {       countDiffs++;   }
1382                                 else if(oligo[i] == 'W' && (seq[i] != 'T' && seq[i] != 'A'))                                    {       countDiffs++;   }
1383                                 else if(oligo[i] == 'S' && (seq[i] != 'C' && seq[i] != 'G'))                                    {       countDiffs++;   }
1384                                 else if(oligo[i] == 'B' && (seq[i] != 'C' && seq[i] != 'T' && seq[i] != 'G'))   {       countDiffs++;   }
1385                                 else if(oligo[i] == 'D' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'G'))   {       countDiffs++;   }
1386                                 else if(oligo[i] == 'H' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'C'))   {       countDiffs++;   }
1387                                 else if(oligo[i] == 'V' && (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G'))   {       countDiffs++;   }       
1388                         }
1389                         
1390                 }
1391                 
1392                 return countDiffs;
1393         }
1394         catch(exception& e) {
1395                 m->errorOut(e, "TrimSeqsCommand", "countDiffs");
1396                 exit(1);
1397         }
1398
1399 }
1400
1401 //***************************************************************************************************************