]> git.donarmstrong.com Git - mothur.git/blob - trimseqscommand.cpp
worked on trim.seqs - added in the groupfiles for allfiles=t, cleaned up the outputNa...
[mothur.git] / trimseqscommand.cpp
1 /*
2  *  trimseqscommand.cpp
3  *  Mothur
4  *
5  *  Created by Pat Schloss on 6/6/09.
6  *  Copyright 2009 Patrick D. Schloss. All rights reserved.
7  *
8  */
9
10 #include "trimseqscommand.h"
11 #include "needlemanoverlap.hpp"
12
13 //**********************************************************************************************************************
14
15 vector<string> TrimSeqsCommand::getValidParameters(){   
16         try {
17                 string Array[] =  {"fasta", "flip", "oligos", "maxambig", "maxhomop","minlength", "maxlength", "qfile", 
18                                                                         "qthreshold", "qwindowaverage", "qstepsize", "qwindowsize", "qaverage", "rollaverage",
19                                                                         "keepfirst", "removelast",
20                                                                         "allfiles", "qtrim","tdiffs", "pdiffs", "bdiffs", "processors", "outputdir","inputdir"};
21                 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
22                 return myArray;
23         }
24         catch(exception& e) {
25                 m->errorOut(e, "TrimSeqsCommand", "getValidParameters");
26                 exit(1);
27         }
28 }
29
30 //**********************************************************************************************************************
31
32 TrimSeqsCommand::TrimSeqsCommand(){     
33         try {
34                 abort = true; calledHelp = true; 
35                 vector<string> tempOutNames;
36                 outputTypes["fasta"] = tempOutNames;
37                 outputTypes["qfile"] = tempOutNames;
38                 outputTypes["group"] = tempOutNames;
39         }
40         catch(exception& e) {
41                 m->errorOut(e, "TrimSeqsCommand", "TrimSeqsCommand");
42                 exit(1);
43         }
44 }
45
46 //**********************************************************************************************************************
47
48 vector<string> TrimSeqsCommand::getRequiredParameters(){        
49         try {
50                 string Array[] =  {"fasta"};
51                 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
52                 return myArray;
53         }
54         catch(exception& e) {
55                 m->errorOut(e, "TrimSeqsCommand", "getRequiredParameters");
56                 exit(1);
57         }
58 }
59
60 //**********************************************************************************************************************
61
62 vector<string> TrimSeqsCommand::getRequiredFiles(){     
63         try {
64                 vector<string> myArray;
65                 return myArray;
66         }
67         catch(exception& e) {
68                 m->errorOut(e, "TrimSeqsCommand", "getRequiredFiles");
69                 exit(1);
70         }
71 }
72
73 //***************************************************************************************************************
74
75 TrimSeqsCommand::TrimSeqsCommand(string option)  {
76         try {
77                 
78                 abort = false; calledHelp = false;   
79                 comboStarts = 0;
80                 
81                 //allow user to run help
82                 if(option == "help") { help(); abort = true; calledHelp = true; }
83                 
84                 else {
85                         //valid paramters for this command
86                         string AlignArray[] =  {        "fasta", "flip", "oligos", "maxambig", "maxhomop", "minlength", "maxlength", "qfile", 
87                                                                 "qthreshold", "qwindowaverage", "qstepsize", "qwindowsize", "qaverage", "rollaverage",
88                                                                 "keepfirst", "removelast",
89                                                                 "allfiles", "qtrim","tdiffs", "pdiffs", "bdiffs", "processors", "outputdir","inputdir"};
90                         
91                         vector<string> myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string)));
92                         
93                         OptionParser parser(option);
94                         map<string,string> parameters = parser.getParameters();
95                         
96                         ValidParameters validParameter;
97                         map<string,string>::iterator it;
98                         
99                         //check to make sure all parameters are valid for command
100                         for (it = parameters.begin(); it != parameters.end(); it++) { 
101                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
102                         }
103                         
104                         //initialize outputTypes
105                         vector<string> tempOutNames;
106                         outputTypes["fasta"] = tempOutNames;
107                         outputTypes["qfile"] = tempOutNames;
108                         outputTypes["group"] = tempOutNames;
109                         
110                         //if the user changes the input directory command factory will send this info to us in the output parameter 
111                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
112                         if (inputDir == "not found"){   inputDir = "";          }
113                         else {
114                                 string path;
115                                 it = parameters.find("fasta");
116                                 //user has given a template file
117                                 if(it != parameters.end()){ 
118                                         path = m->hasPath(it->second);
119                                         //if the user has not given a path then, add inputdir. else leave path alone.
120                                         if (path == "") {       parameters["fasta"] = inputDir + it->second;            }
121                                 }
122                                 
123                                 it = parameters.find("oligos");
124                                 //user has given a template file
125                                 if(it != parameters.end()){ 
126                                         path = m->hasPath(it->second);
127                                         //if the user has not given a path then, add inputdir. else leave path alone.
128                                         if (path == "") {       parameters["oligos"] = inputDir + it->second;           }
129                                 }
130                                 
131                                 it = parameters.find("qfile");
132                                 //user has given a template file
133                                 if(it != parameters.end()){ 
134                                         path = m->hasPath(it->second);
135                                         //if the user has not given a path then, add inputdir. else leave path alone.
136                                         if (path == "") {       parameters["qfile"] = inputDir + it->second;            }
137                                 }
138                                 
139                         }
140
141                         
142                         //check for required parameters
143                         fastaFile = validParameter.validFile(parameters, "fasta", true);
144                         if (fastaFile == "not found") { m->mothurOut("fasta is a required parameter for the trim.seqs command."); m->mothurOutEndLine(); abort = true; }
145                         else if (fastaFile == "not open") { abort = true; }     
146                         
147                         //if the user changes the output directory command factory will send this info to us in the output parameter 
148                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  
149                                 outputDir = ""; 
150                                 outputDir += m->hasPath(fastaFile); //if user entered a file with a path then preserve it       
151                         }
152                 
153                         
154                         //check for optional parameter and set defaults
155                         // ...at some point should added some additional type checking...
156                         string temp;
157                         temp = validParameter.validFile(parameters, "flip", false);
158                         if (temp == "not found"){       flip = 0;       }
159                         else if(m->isTrue(temp))        {       flip = 1;       }
160                 
161                         temp = validParameter.validFile(parameters, "oligos", true);
162                         if (temp == "not found"){       oligoFile = "";         }
163                         else if(temp == "not open"){    abort = true;   } 
164                         else                                    {       oligoFile = temp;               }
165                         
166                         
167                         temp = validParameter.validFile(parameters, "maxambig", false);         if (temp == "not found") { temp = "-1"; }
168                         convert(temp, maxAmbig);  
169
170                         temp = validParameter.validFile(parameters, "maxhomop", false);         if (temp == "not found") { temp = "0"; }
171                         convert(temp, maxHomoP);  
172
173                         temp = validParameter.validFile(parameters, "minlength", false);        if (temp == "not found") { temp = "0"; }
174                         convert(temp, minLength); 
175                         
176                         temp = validParameter.validFile(parameters, "maxlength", false);        if (temp == "not found") { temp = "0"; }
177                         convert(temp, maxLength);
178                         
179                         temp = validParameter.validFile(parameters, "bdiffs", false);           if (temp == "not found") { temp = "0"; }
180                         convert(temp, bdiffs);
181                         
182                         temp = validParameter.validFile(parameters, "pdiffs", false);           if (temp == "not found") { temp = "0"; }
183                         convert(temp, pdiffs);
184                         
185                         temp = validParameter.validFile(parameters, "tdiffs", false);           if (temp == "not found") { int tempTotal = pdiffs + bdiffs;  temp = toString(tempTotal); }
186                         convert(temp, tdiffs);
187                         
188                         if(tdiffs == 0){        tdiffs = bdiffs + pdiffs;       }
189                         
190                         temp = validParameter.validFile(parameters, "qfile", true);     
191                         if (temp == "not found")        {       qFileName = "";         }
192                         else if(temp == "not open")     {       abort = true;           }
193                         else                                            {       qFileName = temp;       }
194                         
195                         temp = validParameter.validFile(parameters, "qthreshold", false);       if (temp == "not found") { temp = "0"; }
196                         convert(temp, qThreshold);
197                         
198                         temp = validParameter.validFile(parameters, "qtrim", false);            if (temp == "not found") { temp = "t"; }
199                         qtrim = m->isTrue(temp);
200
201                         temp = validParameter.validFile(parameters, "rollaverage", false);      if (temp == "not found") { temp = "0"; }
202                         convert(temp, qRollAverage);
203
204                         temp = validParameter.validFile(parameters, "qwindowaverage", false);if (temp == "not found") { temp = "0"; }
205                         convert(temp, qWindowAverage);
206
207                         temp = validParameter.validFile(parameters, "qwindowsize", false);      if (temp == "not found") { temp = "50"; }
208                         convert(temp, qWindowSize);
209
210                         temp = validParameter.validFile(parameters, "qstepsize", false);        if (temp == "not found") { temp = "1"; }
211                         convert(temp, qWindowStep);
212
213                         temp = validParameter.validFile(parameters, "qaverage", false);         if (temp == "not found") { temp = "0"; }
214                         convert(temp, qAverage);
215
216                         temp = validParameter.validFile(parameters, "keepfirst", false);        if (temp == "not found") { temp = "0"; }
217                         convert(temp, keepFirst);
218
219                         temp = validParameter.validFile(parameters, "removelast", false);       if (temp == "not found") { temp = "0"; }
220                         convert(temp, removeLast);
221                         
222                         temp = validParameter.validFile(parameters, "allfiles", false);         if (temp == "not found") { temp = "F"; }
223                         allFiles = m->isTrue(temp);
224                         
225                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found") { temp = "1"; }
226                         convert(temp, processors); 
227                         
228                         
229                         if(allFiles && (oligoFile == "")){
230                                 m->mothurOut("You selected allfiles, but didn't enter an oligos.  Ignoring the allfiles request."); m->mothurOutEndLine();
231                         }
232                         if((qAverage != 0 && qThreshold != 0) && qFileName == ""){
233                                 m->mothurOut("You didn't provide a quality file name, quality criteria will be ignored."); m->mothurOutEndLine();
234                                 qAverage=0;
235                                 qThreshold=0;
236                         }
237                         if(!flip && oligoFile=="" && !maxLength && !minLength && (maxAmbig==-1) && !maxHomoP && qFileName == ""){               
238                                 m->mothurOut("You didn't set any options... quiting command."); m->mothurOutEndLine();
239                                 abort = true;
240                         }
241                 }
242
243         }
244         catch(exception& e) {
245                 m->errorOut(e, "TrimSeqsCommand", "TrimSeqsCommand");
246                 exit(1);
247         }
248 }
249
250 //**********************************************************************************************************************
251
252 void TrimSeqsCommand::help(){
253         try {
254                 m->mothurOut("The trim.seqs command reads a fastaFile and creates 2 new fasta files, .trim.fasta and scrap.fasta, as well as group files if you provide and oligos file.\n");
255                 m->mothurOut("The .trim.fasta contains sequences that meet your requirements, and the .scrap.fasta contains those which don't.\n");
256                 m->mothurOut("The trim.seqs command parameters are fasta, flip, oligos, maxambig, maxhomop, minlength, maxlength, qfile, qthreshold, qaverage, diffs, qtrim, keepfirst, removelast and allfiles.\n");
257                 m->mothurOut("The fasta parameter is required.\n");
258                 m->mothurOut("The flip parameter will output the reverse compliment of your trimmed sequence. The default is false.\n");
259                 m->mothurOut("The oligos parameter allows you to provide an oligos file.\n");
260                 m->mothurOut("The maxambig parameter allows you to set the maximum number of ambigious bases allowed. The default is -1.\n");
261                 m->mothurOut("The maxhomop parameter allows you to set a maximum homopolymer length. \n");
262                 m->mothurOut("The minlength parameter allows you to set and minimum sequence length. \n");
263                 m->mothurOut("The maxlength parameter allows you to set and maximum sequence length. \n");
264                 m->mothurOut("The tdiffs parameter is used to specify the total number of differences allowed in the sequence. The default is pdiffs + bdiffs.\n");
265                 m->mothurOut("The bdiffs parameter is used to specify the number of differences allowed in the barcode. The default is 0.\n");
266                 m->mothurOut("The pdiffs parameter is used to specify the number of differences allowed in the primer. The default is 0.\n");
267                 m->mothurOut("The qfile parameter allows you to provide a quality file.\n");
268                 m->mothurOut("The qthreshold parameter allows you to set a minimum quality score allowed. \n");
269                 m->mothurOut("The qaverage parameter allows you to set a minimum average quality score allowed. \n");
270                 m->mothurOut("The qwindowsize parameter allows you to set a number of bases in a window. Default=50.\n");
271                 m->mothurOut("The qwindowaverage parameter allows you to set a minimum average quality score allowed over a window. \n");
272                 m->mothurOut("The rollaverage parameter allows you to set a minimum rolling average quality score allowed over a window. \n");
273                 m->mothurOut("The qstepsize parameter allows you to set a number of bases to move the window over. Default=1.\n");
274                 m->mothurOut("The allfiles parameter will create separate group and fasta file for each grouping. The default is F.\n");
275                 m->mothurOut("The qtrim parameter will trim sequence from the point that they fall below the qthreshold and put it in the .trim file if set to true. The default is T.\n");
276                 m->mothurOut("The keepfirst parameter trims the sequence to the first keepfirst number of bases after the barcode or primers are removed, before the sequence is checked to see if it meets the other requirements. \n");
277                 m->mothurOut("The removelast removes the last removelast number of bases after the barcode or primers are removed, before the sequence is checked to see if it meets the other requirements.\n");
278                 m->mothurOut("The trim.seqs command should be in the following format: \n");
279                 m->mothurOut("trim.seqs(fasta=yourFastaFile, flip=yourFlip, oligos=yourOligos, maxambig=yourMaxambig,  \n");
280                 m->mothurOut("maxhomop=yourMaxhomop, minlength=youMinlength, maxlength=yourMaxlength)  \n");    
281                 m->mothurOut("Example trim.seqs(fasta=abrecovery.fasta, flip=..., oligos=..., maxambig=..., maxhomop=..., minlength=..., maxlength=...).\n");
282                 m->mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n");
283                 m->mothurOut("For more details please check out the wiki http://www.mothur.org/wiki/Trim.seqs .\n\n");
284
285         }
286         catch(exception& e) {
287                 m->errorOut(e, "TrimSeqsCommand", "help");
288                 exit(1);
289         }
290 }
291
292
293 //***************************************************************************************************************
294
295 TrimSeqsCommand::~TrimSeqsCommand(){    /*      do nothing      */      }
296
297 //***************************************************************************************************************
298
299 int TrimSeqsCommand::execute(){
300         try{
301         
302                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
303                 
304                 numFPrimers = 0;  //this needs to be initialized
305                 numRPrimers = 0;
306                 vector<vector<string> > fastaFileNames;
307                 vector<vector<string> > qualFileNames;
308                 
309                 string trimSeqFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "trim.fasta";
310                 outputNames.push_back(trimSeqFile); outputTypes["fasta"].push_back(trimSeqFile);
311                 
312                 string scrapSeqFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "scrap.fasta";
313                 outputNames.push_back(scrapSeqFile); outputTypes["fasta"].push_back(scrapSeqFile);
314                 
315                 string trimQualFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "trim.qual";
316                 string scrapQualFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "scrap.qual";
317                 if (qFileName != "") {
318                         outputNames.push_back(trimQualFile);
319                         outputNames.push_back(scrapQualFile);
320                         outputTypes["qfile"].push_back(trimQualFile);
321                         outputTypes["qfile"].push_back(scrapQualFile); 
322                 }
323                 
324                 string outputGroupFileName;
325                 if(oligoFile != ""){
326                         outputGroupFileName = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "groups";
327                         outputNames.push_back(outputGroupFileName); outputTypes["group"].push_back(outputGroupFileName);
328                         getOligos(fastaFileNames, qualFileNames);
329                 }
330
331                 vector<unsigned long int> fastaFilePos;
332                 vector<unsigned long int> qFilePos;
333                 
334                 setLines(fastaFile, qFileName, fastaFilePos, qFilePos);
335                 
336                 for (int i = 0; i < (fastaFilePos.size()-1); i++) {
337                         lines.push_back(new linePair(fastaFilePos[i], fastaFilePos[(i+1)]));
338                         if (qFileName != "") {  qLines.push_back(new linePair(qFilePos[i], qFilePos[(i+1)]));  }
339                 }       
340                 if(qFileName == "")     {       qLines = lines; } //files with duds
341                 
342                 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
343                                 if(processors == 1){
344                                         driverCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, outputGroupFileName, fastaFileNames, qualFileNames, lines[0], qLines[0]);
345                                 }else{
346                                         createProcessesCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, outputGroupFileName, fastaFileNames, qualFileNames); 
347                                 }       
348                 #else
349                                 driverCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, outputGroupFileName, fastaFileNames, qualFileNames, lines[0], qLines[0]);
350                 #endif
351                 
352                 if (m->control_pressed) {  return 0; }                  
353                         
354                 if(allFiles){
355                         //clear out all old group files
356                         map<string, string> uniqueFastaNames;// so we don't add the same groupfile multiple times
357                         map<string, string>::iterator it;
358                         set<string> namesToRemove;
359                         for(int i=0;i<fastaFileNames.size();i++){
360                                 for(int j=0;j<fastaFileNames[0].size();j++){
361                                         if (fastaFileNames[i][j] != "") {
362                                                 if(m->isBlank(fastaFileNames[i][j])){
363                                                         remove(fastaFileNames[i][j].c_str());
364                                                         namesToRemove.insert(fastaFileNames[i][j]);
365                                                         
366                                                         if(qFileName != ""){
367                                                                 remove(qualFileNames[i][j].c_str());
368                                                                 namesToRemove.insert(qualFileNames[i][j]);
369                                                         }
370                                                 }else{  
371                                                         it = uniqueFastaNames.find(fastaFileNames[i][j]);
372                                                         if (it == uniqueFastaNames.end()) {     
373                                                                 uniqueFastaNames[fastaFileNames[i][j]] = barcodeNameVector[i];  
374                                                         }       
375                                                 }
376                                         }
377                                 }
378                         }
379                         
380                         //remove names for outputFileNames, just cleans up the output
381                         vector<string> outputNames2;
382                         for(int i = 0; i < outputNames.size(); i++) { if (namesToRemove.count(outputNames[i]) == 0) { outputNames2.push_back(outputNames[i]); } }
383                         outputNames = outputNames2;
384                         
385                         for (it = uniqueFastaNames.begin(); it != uniqueFastaNames.end(); it++) {
386                                 ifstream in;
387                                 m->openInputFile(it->first, in);
388                                 
389                                 ofstream out;
390                                 string thisGroupName = outputDir + m->getRootName(m->getSimpleName(it->first)) + "groups";
391                                 outputNames.push_back(thisGroupName); outputTypes["group"].push_back(thisGroupName);
392                                 m->openOutputFile(thisGroupName, out);
393                                 
394                                 while (!in.eof()){
395                                         if (m->control_pressed) { break; }
396                                         
397                                         Sequence currSeq(in); m->gobble(in);
398                                         out << currSeq.getName() << '\t' << it->second << endl;
399                                 }
400                                 in.close();
401                                 out.close();
402                         }
403                 }
404                 
405                 if (m->control_pressed) {       for (int i = 0; i < outputNames.size(); i++) {  remove(outputNames[i].c_str()); } return 0;     }
406
407                 //output group counts
408                 m->mothurOutEndLine();
409                 int total = 0;
410                 for (int i = 0; i < barcodeNameVector.size(); i++) {
411                         if ((barcodeNameVector[i] != "") && (groupCounts[i] != 0)) { total += groupCounts[i]; m->mothurOut("Group " + barcodeNameVector[i] + " contains " + toString(groupCounts[i]) + " sequences."); m->mothurOutEndLine(); }
412                 }
413                 if (total != 0) { m->mothurOut("Total of all groups is " + toString(total)); m->mothurOutEndLine(); }
414                 
415                         if (m->control_pressed) {       for (int i = 0; i < outputNames.size(); i++) {  remove(outputNames[i].c_str()); } return 0;     }
416
417                 m->mothurOutEndLine();
418                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
419                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
420                 m->mothurOutEndLine();
421                 
422                 return 0;       
423                         
424         }
425         catch(exception& e) {
426                 m->errorOut(e, "TrimSeqsCommand", "execute");
427                 exit(1);
428         }
429 }
430                 
431 /**************************************************************************************/
432
433 int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string trimFileName, string scrapFileName, string trimQFileName, string scrapQFileName, string groupFileName, vector<vector<string> > fastaFileNames, vector<vector<string> > qualFileNames, linePair* line, linePair* qline) {        
434                 
435         try {
436                 
437                 ofstream trimFASTAFile;
438                 m->openOutputFile(trimFileName, trimFASTAFile);
439                 
440                 ofstream scrapFASTAFile;
441                 m->openOutputFile(scrapFileName, scrapFASTAFile);
442                 
443                 ofstream trimQualFile;
444                 ofstream scrapQualFile;
445                 if(qFileName != ""){
446                         m->openOutputFile(trimQFileName, trimQualFile);
447                         m->openOutputFile(scrapQFileName, scrapQualFile);
448                 }
449                 
450                 ofstream outGroupsFile;
451                 if (oligoFile != ""){   m->openOutputFile(groupFileName, outGroupsFile);   }
452                 if(allFiles){
453                         for (int i = 0; i < fastaFileNames.size(); i++) { //clears old file
454                                 for (int j = 0; j < fastaFileNames[i].size(); j++) { //clears old file
455                                         if (fastaFileNames[i][j] != "") {
456                                                 ofstream temp;
457                                                 m->openOutputFile(fastaFileNames[i][j], temp);                  temp.close();
458                                                 if(qFileName != ""){
459                                                         m->openOutputFile(qualFileNames[i][j], temp);                   temp.close();
460                                                 }
461                                         }
462                                 }
463                         }
464                 }
465                 
466                 ifstream inFASTA;
467                 m->openInputFile(filename, inFASTA);
468                 inFASTA.seekg(line->start);
469                 
470                 ifstream qFile;
471                 if(qFileName != "")     {
472                         m->openInputFile(qFileName, qFile);
473                         qFile.seekg(qline->start);  
474                 }
475                 
476                 int count = 0;
477                 bool moreSeqs = 1;
478         
479                 while (moreSeqs) {
480                                 
481                         if (m->control_pressed) { 
482                                 inFASTA.close(); trimFASTAFile.close(); scrapFASTAFile.close();
483                                 if (oligoFile != "") {   outGroupsFile.close();   }
484
485                                 if(qFileName != ""){
486                                         qFile.close();
487                                 }
488                                 for (int i = 0; i < outputNames.size(); i++) {  remove(outputNames[i].c_str()); }
489
490                                 return 0;
491                         }
492                         
493                         int success = 1;
494                         string trashCode = "";
495                         int currentSeqsDiffs = 0;
496
497                         Sequence currSeq(inFASTA); m->gobble(inFASTA);
498
499                         QualityScores currQual;
500                         if(qFileName != ""){
501                                 currQual = QualityScores(qFile);  m->gobble(qFile);
502                         }
503
504                         string origSeq = currSeq.getUnaligned();
505                         if (origSeq != "") {
506                                 
507                                 int barcodeIndex = 0;
508                                 int primerIndex = 0;
509                                 
510                                 if(barcodes.size() != 0){
511                                         success = stripBarcode(currSeq, currQual, barcodeIndex);
512                                         if(success > bdiffs)            {       trashCode += 'b';       }
513                                         else{ currentSeqsDiffs += success;  }
514                                 }
515                                 
516                                 if(numFPrimers != 0){
517                                         success = stripForward(currSeq, currQual, primerIndex);
518                                         if(success > pdiffs)            {       trashCode += 'f';       }
519                                         else{ currentSeqsDiffs += success;  }
520                                 }
521                                 
522                                 if (currentSeqsDiffs > tdiffs)  {       trashCode += 't';   }
523                                 
524                                 if(numRPrimers != 0){
525                                         success = stripReverse(currSeq, currQual);
526                                         if(!success)                            {       trashCode += 'r';       }
527                                 }
528
529                                 if(keepFirst != 0){
530                                         success = keepFirstTrim(currSeq, currQual);
531                                 }
532                                 
533                                 if(removeLast != 0){
534                                         success = removeLastTrim(currSeq, currQual);
535                                         if(!success)                            {       trashCode += 'l';       }
536                                 }
537
538                                 
539                                 if(qFileName != ""){
540                                         int origLength = currSeq.getNumBases();
541                                         
542                                         if(qThreshold != 0)                     {       success = currQual.stripQualThreshold(currSeq, qThreshold);                     }
543                                         else if(qAverage != 0)          {       success = currQual.cullQualAverage(currSeq, qAverage);                          }
544                                         else if(qRollAverage != 0)      {       success = currQual.stripQualRollingAverage(currSeq, qRollAverage);      }
545                                         else if(qWindowAverage != 0){   success = currQual.stripQualWindowAverage(currSeq, qWindowStep, qWindowSize, qWindowAverage);   }
546                                         else                                            {       success = 1;                            }
547                                         
548                                         //you don't want to trim, if it fails above then scrap it
549                                         if ((!qtrim) && (origLength != currSeq.getNumBases())) {  success = 0; }
550                                         
551                                         if(!success)                            {       trashCode += 'q';       }
552                                 }                               
553                 
554                                 if(minLength > 0 || maxLength > 0){
555                                         success = cullLength(currSeq);
556                                         if(!success)                            {       trashCode += 'l';       }
557                                 }
558                                 if(maxHomoP > 0){
559                                         success = cullHomoP(currSeq);
560                                         if(!success)                            {       trashCode += 'h';       }
561                                 }
562                                 if(maxAmbig != -1){
563                                         success = cullAmbigs(currSeq);
564                                         if(!success)                            {       trashCode += 'n';       }
565                                 }
566                                 
567                                 if(flip){               // should go last                       
568                                         currSeq.reverseComplement();
569                                         if(qFileName != ""){
570                                                 currQual.flipQScores(); 
571                                         }
572                                 }
573                                 
574                                 if(trashCode.length() == 0){
575                                         currSeq.setAligned(currSeq.getUnaligned());
576                                         currSeq.printSequence(trimFASTAFile);
577                                         
578                                         if(qFileName != ""){
579                                                 currQual.printQScores(trimQualFile);
580                                         }
581                                         
582                                         if(barcodes.size() != 0){
583                                                 outGroupsFile << currSeq.getName() << '\t' << barcodeNameVector[barcodeIndex] << endl;
584                                                 groupCounts[barcodeIndex]++;
585                                         }
586                                         
587                                         
588                                         if(allFiles){
589                                                 ofstream output;
590                                                 m->openOutputFileAppend(fastaFileNames[barcodeIndex][primerIndex], output);
591                                                 currSeq.printSequence(output);
592                                                 output.close();
593                                                 
594                                                 if(qFileName != ""){
595                                                         m->openOutputFileAppend(qualFileNames[barcodeIndex][primerIndex], output);
596                                                         currQual.printQScores(output);
597                                                         output.close();                                                 
598                                                 }
599                                         }
600                                 }
601                                 else{
602                                         currSeq.setName(currSeq.getName() + '|' + trashCode);
603                                         currSeq.setUnaligned(origSeq);
604                                         currSeq.setAligned(origSeq);
605                                         currSeq.printSequence(scrapFASTAFile);
606                                         if(qFileName != ""){
607                                                 currQual.printQScores(scrapQualFile);
608                                         }
609                                 }
610                                 count++;
611                         }
612                         
613                         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
614                                 unsigned long int pos = inFASTA.tellg();
615                                 if ((pos == -1) || (pos >= line->end)) { break; }
616                         #else
617                                 if (inFASTA.eof()) { break; }
618                         #endif
619                                 
620                         //report progress
621                         if((count) % 1000 == 0){        m->mothurOut(toString(count)); m->mothurOutEndLine();           }
622                         
623                 }
624                 //report progress
625                 if((count) % 1000 != 0){        m->mothurOut(toString(count)); m->mothurOutEndLine();           }
626
627                 
628                 inFASTA.close();
629                 trimFASTAFile.close();
630                 scrapFASTAFile.close();
631                 if (oligoFile != "") {   outGroupsFile.close();   }
632                 if(qFileName != "")     {       qFile.close();  scrapQualFile.close(); trimQualFile.close();    }
633                 
634                 return count;
635         }
636         catch(exception& e) {
637                 m->errorOut(e, "TrimSeqsCommand", "driverCreateTrim");
638                 exit(1);
639         }
640 }
641
642 /**************************************************************************************************/
643
644 int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName, string trimFASTAFileName, string scrapFASTAFileName, string trimQualFileName, string scrapQualFileName, string groupFile, vector<vector<string> > fastaFileNames, vector<vector<string> > qualFileNames) {
645         try {
646 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
647                 int process = 1;
648                 int exitCommand = 1;
649                 processIDS.clear();
650                 
651                 //loop through and create all the processes you want
652                 while (process != processors) {
653                         int pid = fork();
654                         
655                         if (pid > 0) {
656                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
657                                 process++;
658                         }else if (pid == 0){
659                                 
660                                 vector<vector<string> > tempFASTAFileNames = fastaFileNames;
661                                 vector<vector<string> > tempPrimerQualFileNames = qualFileNames;
662
663                                 if(allFiles){
664                                         ofstream temp;
665
666                                         for(int i=0;i<tempFASTAFileNames.size();i++){
667                                                 for(int j=0;j<tempFASTAFileNames[i].size();j++){
668                                                         if (tempFASTAFileNames[i][j] != "") {
669                                                                 tempFASTAFileNames[i][j] += toString(getpid()) + ".temp";
670                                                                 m->openOutputFile(tempFASTAFileNames[i][j], temp);                      temp.close();
671
672                                                                 if(qFileName != ""){
673                                                                         tempPrimerQualFileNames[i][j] += toString(getpid()) + ".temp";
674                                                                         m->openOutputFile(tempPrimerQualFileNames[i][j], temp);         temp.close();
675                                                                 }
676                                                         }
677                                                 }
678                                         }
679                                 }
680                                                         
681                                 driverCreateTrim(filename,
682                                                                  qFileName,
683                                                                  (trimFASTAFileName + toString(getpid()) + ".temp"),
684                                                                  (scrapFASTAFileName + toString(getpid()) + ".temp"),
685                                                                  (trimQualFileName + toString(getpid()) + ".temp"),
686                                                                  (scrapQualFileName + toString(getpid()) + ".temp"),
687                                                                  (groupFile + toString(getpid()) + ".temp"),
688                                                                  tempFASTAFileNames,
689                                                                  tempPrimerQualFileNames,
690                                                                  lines[process],
691                                                                  qLines[process]);
692                                 
693                                 //pass groupCounts to parent
694                                 ofstream out;
695                                 string tempFile = filename + toString(getpid()) + ".num.temp";
696                                 m->openOutputFile(tempFile, out);
697                                 for(int i = 0; i < groupCounts.size(); i++) {
698                                         out << groupCounts[i] << endl;
699                                 }
700                                 out.close();
701                                 
702                                 exit(0);
703                         }else { 
704                                 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine(); 
705                                 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
706                                 exit(0);
707                         }
708                 }
709                 
710                 //parent do my part
711                 ofstream temp;
712                 m->openOutputFile(trimFASTAFileName, temp);             temp.close();
713                 m->openOutputFile(scrapFASTAFileName, temp);    temp.close();
714                 m->openOutputFile(trimQualFileName, temp);              temp.close();
715                 m->openOutputFile(scrapQualFileName, temp);             temp.close();
716
717                 driverCreateTrim(filename, qFileName, trimFASTAFileName, scrapFASTAFileName, trimQualFileName, scrapQualFileName, groupFile, fastaFileNames, qualFileNames, lines[0], qLines[0]);
718                 
719                 //force parent to wait until all the processes are done
720                 for (int i=0;i<processIDS.size();i++) { 
721                         int temp = processIDS[i];
722                         wait(&temp);
723                 }
724                 
725                 //append files
726                 for(int i=0;i<processIDS.size();i++){
727                         
728                         m->mothurOut("Appending files from process " + toString(processIDS[i])); m->mothurOutEndLine();
729                         
730                         m->appendFiles((trimFASTAFileName + toString(processIDS[i]) + ".temp"), trimFASTAFileName);
731                         remove((trimFASTAFileName + toString(processIDS[i]) + ".temp").c_str());
732                         m->appendFiles((scrapFASTAFileName + toString(processIDS[i]) + ".temp"), scrapFASTAFileName);
733                         remove((scrapFASTAFileName + toString(processIDS[i]) + ".temp").c_str());
734                         
735                         if(qFileName != ""){
736                                 m->appendFiles((trimQualFileName + toString(processIDS[i]) + ".temp"), trimQualFileName);
737                                 remove((trimQualFileName + toString(processIDS[i]) + ".temp").c_str());
738                                 m->appendFiles((scrapQualFileName + toString(processIDS[i]) + ".temp"), scrapQualFileName);
739                                 remove((scrapQualFileName + toString(processIDS[i]) + ".temp").c_str());
740                         }
741                         
742                         m->appendFiles((groupFile + toString(processIDS[i]) + ".temp"), groupFile);
743                         remove((groupFile + toString(processIDS[i]) + ".temp").c_str());
744                         
745                         
746                         if(allFiles){
747                                 for(int j=0;j<fastaFileNames.size();j++){
748                                         for(int k=0;k<fastaFileNames[j].size();k++){
749                                                 if (fastaFileNames[j][k] != "") {
750                                                         m->appendFiles((fastaFileNames[j][k] + toString(processIDS[i]) + ".temp"), fastaFileNames[j][k]);
751                                                         remove((fastaFileNames[j][k] + toString(processIDS[i]) + ".temp").c_str());
752                                                         
753                                                         if(qFileName != ""){
754                                                                 m->appendFiles((qualFileNames[j][k] + toString(processIDS[i]) + ".temp"), qualFileNames[j][k]);
755                                                                 remove((qualFileNames[j][k] + toString(processIDS[i]) + ".temp").c_str());
756                                                         }
757                                                 }
758                                         }
759                                 }
760                         }
761                         
762                         ifstream in;
763                         string tempFile =  filename + toString(processIDS[i]) + ".num.temp";
764                         m->openInputFile(tempFile, in);
765                         int count = 0; 
766                         int tempNum;
767                         while (!in.eof()) { 
768                                 in >> tempNum; m->gobble(in);
769                                 groupCounts[count] += tempNum; 
770                                 count++;
771                         }
772                         in.close(); remove(tempFile.c_str());
773                         
774                 }
775         
776                 return exitCommand;
777 #endif          
778         }
779         catch(exception& e) {
780                 m->errorOut(e, "TrimSeqsCommand", "createProcessesCreateTrim");
781                 exit(1);
782         }
783 }
784
785 /**************************************************************************************************/
786
787 int TrimSeqsCommand::setLines(string filename, string qfilename, vector<unsigned long int>& fastaFilePos, vector<unsigned long int>& qfileFilePos) {
788         try {
789                 
790                 //set file positions for fasta file
791                 fastaFilePos = m->divideFile(filename, processors);
792                 
793                 if (qfilename == "") { return processors; }
794                 
795                 //get name of first sequence in each chunk
796                 map<string, int> firstSeqNames;
797                 for (int i = 0; i < (fastaFilePos.size()-1); i++) {
798                         ifstream in;
799                         m->openInputFile(filename, in);
800                         in.seekg(fastaFilePos[i]);
801                 
802                         Sequence temp(in); 
803                         firstSeqNames[temp.getName()] = i;
804                 
805                         in.close();
806                 }
807                                 
808                 //seach for filePos of each first name in the qfile and save in qfileFilePos
809                 ifstream inQual;
810                 m->openInputFile(qfilename, inQual);
811                 
812                 string input;
813                 while(!inQual.eof()){   
814                         input = m->getline(inQual);
815
816                         if (input.length() != 0) {
817                                 if(input[0] == '>'){ //this is a sequence name line
818                                         istringstream nameStream(input);
819                                         
820                                         string sname = "";  nameStream >> sname;
821                                         sname = sname.substr(1);
822                                         
823                                         map<string, int>::iterator it = firstSeqNames.find(sname);
824                                         
825                                         if(it != firstSeqNames.end()) { //this is the start of a new chunk
826                                                 unsigned long int pos = inQual.tellg(); 
827                                                 qfileFilePos.push_back(pos - input.length() - 1);       
828                                                 firstSeqNames.erase(it);
829                                         }
830                                 }
831                         }
832                         
833                         if (firstSeqNames.size() == 0) { break; }
834                 }
835                 inQual.close();
836                 
837                 
838                 if (firstSeqNames.size() != 0) { 
839                         for (map<string, int>::iterator it = firstSeqNames.begin(); it != firstSeqNames.end(); it++) {
840                                 m->mothurOut(it->first + " is in your fasta file and not in your quality file, not using quality file."); m->mothurOutEndLine();
841                         }
842                         qFileName = "";
843                         return processors;
844                 }
845
846                 //get last file position of qfile
847                 FILE * pFile;
848                 unsigned long int size;
849                 
850                 //get num bytes in file
851                 pFile = fopen (qfilename.c_str(),"rb");
852                 if (pFile==NULL) perror ("Error opening file");
853                 else{
854                         fseek (pFile, 0, SEEK_END);
855                         size=ftell (pFile);
856                         fclose (pFile);
857                 }
858                 
859                 qfileFilePos.push_back(size);
860                 
861                 return processors;
862         }
863         catch(exception& e) {
864                 m->errorOut(e, "TrimSeqsCommand", "setLines");
865                 exit(1);
866         }
867 }
868
869 //***************************************************************************************************************
870
871 void TrimSeqsCommand::getOligos(vector<vector<string> >& fastaFileNames, vector<vector<string> >& qualFileNames){
872         try {
873                 ifstream inOligos;
874                 m->openInputFile(oligoFile, inOligos);
875                 
876                 ofstream test;
877                 
878                 string type, oligo, group;
879
880                 int indexPrimer = 0;
881                 int indexBarcode = 0;
882                 
883                 while(!inOligos.eof()){
884
885                         inOligos >> type; m->gobble(inOligos);
886                                         
887                         if(type[0] == '#'){
888                                 while (!inOligos.eof()) {       char c = inOligos.get(); if (c == 10 || c == 13){       break;  }       } // get rest of line if there's any crap there
889                         }
890                         else{
891                                 //make type case insensitive
892                                 for(int i=0;i<type.length();i++){       type[i] = toupper(type[i]);  }
893                                 
894                                 inOligos >> oligo;
895                                 
896                                 for(int i=0;i<oligo.length();i++){
897                                         oligo[i] = toupper(oligo[i]);
898                                         if(oligo[i] == 'U')     {       oligo[i] = 'T'; }
899                                 }
900                                 
901                                 if(type == "FORWARD"){
902                                         group = "";
903                                         
904                                         // get rest of line in case there is a primer name
905                                         while (!inOligos.eof()) {       
906                                                 char c = inOligos.get(); 
907                                                 if (c == 10 || c == 13){        break;  }
908                                                 else if (c == 32 || c == 9){;} //space or tab
909                                                 else {  group += c;  }
910                                         } 
911                                         
912                                         //check for repeat barcodes
913                                         map<string, int>::iterator itPrime = primers.find(oligo);
914                                         if (itPrime != primers.end()) { m->mothurOut("primer " + oligo + " is in your oligos file already."); m->mothurOutEndLine();  }
915                                         
916                                         primers[oligo]=indexPrimer; indexPrimer++;              
917                                         primerNameVector.push_back(group);
918                                 }
919                                 else if(type == "REVERSE"){
920                                         Sequence oligoRC("reverse", oligo);
921                                         oligoRC.reverseComplement();
922                                         revPrimer.push_back(oligoRC.getUnaligned());
923                                 }
924                                 else if(type == "BARCODE"){
925                                         inOligos >> group;
926                                         
927                                         //check for repeat barcodes
928                                         map<string, int>::iterator itBar = barcodes.find(oligo);
929                                         if (itBar != barcodes.end()) { m->mothurOut("barcode " + oligo + " is in your oligos file already."); m->mothurOutEndLine();  }
930                                                 
931                                         barcodes[oligo]=indexBarcode; indexBarcode++;
932                                         barcodeNameVector.push_back(group);
933                                 }
934                                 else{   m->mothurOut(type + " is not recognized as a valid type. Choices are forward, reverse, and barcode. Ignoring " + oligo + "."); m->mothurOutEndLine();  }
935                         }
936                         m->gobble(inOligos);
937                 }       
938                 inOligos.close();
939                 
940                 if(barcodeNameVector.size() == 0 && primerNameVector[0] == ""){ allFiles = 0;   }
941                 
942                 //add in potential combos
943                 if(barcodeNameVector.size() == 0){
944                         barcodes[""] = 0;
945                         barcodeNameVector.push_back("");                        
946                 }
947                 
948                 if(primerNameVector.size() == 0){
949                         primers[""] = 0;
950                         primerNameVector.push_back("");                 
951                 }
952                 
953                 fastaFileNames.resize(barcodeNameVector.size());
954                 for(int i=0;i<fastaFileNames.size();i++){
955                         fastaFileNames[i].assign(primerNameVector.size(), "");
956                 }
957                 if(qFileName != ""){    qualFileNames = fastaFileNames; }
958                 
959                 if(allFiles){
960                         set<string> uniqueNames; //used to cleanup outputFileNames
961                         for(map<string, int>::iterator itBar = barcodes.begin();itBar != barcodes.end();itBar++){
962                                 for(map<string, int>::iterator itPrimer = primers.begin();itPrimer != primers.end(); itPrimer++){
963                                         
964                                         string primerName = primerNameVector[itPrimer->second];
965                                         string barcodeName = barcodeNameVector[itBar->second];
966                                         
967                                         string comboGroupName = "";
968                                         string fastaFileName = "";
969                                         string qualFileName = "";
970                                         
971                                         if(primerName == ""){
972                                                 comboGroupName = barcodeNameVector[itBar->second];
973                                         }
974                                         else{
975                                                 if(barcodeName == ""){
976                                                         comboGroupName = primerNameVector[itPrimer->second];
977                                                 }
978                                                 else{
979                                                         comboGroupName = barcodeNameVector[itBar->second] + "." + primerNameVector[itPrimer->second];
980                                                 }
981                                         }
982
983                                         ofstream temp;
984                                         fastaFileName = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + comboGroupName + ".fasta";
985                                         if (uniqueNames.count(fastaFileName) == 0) {
986                                                 outputNames.push_back(fastaFileName);
987                                                 outputTypes["fasta"].push_back(fastaFileName);
988                                                 uniqueNames.insert(fastaFileName);
989                                         }
990                                         
991                                         fastaFileNames[itBar->second][itPrimer->second] = fastaFileName;
992                                         m->openOutputFile(fastaFileName, temp);         temp.close();
993
994                                         if(qFileName != ""){
995                                                 qualFileName = outputDir + m->getRootName(m->getSimpleName(qFileName)) + comboGroupName + ".qual";
996                                                 if (uniqueNames.count(fastaFileName) == 0) {
997                                                         outputNames.push_back(qualFileName);
998                                                         outputTypes["qfile"].push_back(qualFileName);
999                                                 }
1000                                                 
1001                                                 qualFileNames[itBar->second][itPrimer->second] = qualFileName;
1002                                                 m->openOutputFile(qualFileName, temp);          temp.close();
1003                                         }
1004                                 }
1005                         }
1006                 }
1007                 numFPrimers = primers.size();
1008                 numRPrimers = revPrimer.size();
1009                 groupCounts.resize(barcodeNameVector.size(), 0);
1010
1011         }
1012         catch(exception& e) {
1013                 m->errorOut(e, "TrimSeqsCommand", "getOligos");
1014                 exit(1);
1015         }
1016 }
1017
1018 //***************************************************************************************************************
1019
1020 int TrimSeqsCommand::stripBarcode(Sequence& seq, QualityScores& qual, int& group){
1021         try {
1022                 
1023                 string rawSequence = seq.getUnaligned();
1024                 int success = bdiffs + 1;       //guilty until proven innocent
1025                 
1026                 //can you find the barcode
1027                 for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
1028                         string oligo = it->first;
1029                         if(rawSequence.length() < oligo.length()){      //let's just assume that the barcodes are the same length
1030                                 success = bdiffs + 10;                                  //if the sequence is shorter than the barcode then bail out
1031                                 break;  
1032                         }
1033                         
1034                         if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
1035                                 group = it->second;
1036                                 seq.setUnaligned(rawSequence.substr(oligo.length()));
1037                                 
1038                                 if(qual.getName() != ""){
1039                                         qual.trimQScores(oligo.length(), -1);
1040                                 }
1041                                 
1042                                 success = 0;
1043                                 break;
1044                         }
1045                 }
1046                 
1047                 //if you found the barcode or if you don't want to allow for diffs
1048                 if ((bdiffs == 0) || (success == 0)) { return success;  }
1049                 
1050                 else { //try aligning and see if you can find it
1051
1052                         int maxLength = 0;
1053
1054                         Alignment* alignment;
1055                         if (barcodes.size() > 0) {
1056                                 map<string,int>::iterator it=barcodes.begin();
1057
1058                                 for(it;it!=barcodes.end();it++){
1059                                         if(it->first.length() > maxLength){
1060                                                 maxLength = it->first.length();
1061                                         }
1062                                 }
1063                                 alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+bdiffs+1));  
1064
1065                         }else{ alignment = NULL; } 
1066                         
1067                         //can you find the barcode
1068                         int minDiff = 1e6;
1069                         int minCount = 1;
1070                         int minGroup = -1;
1071                         int minPos = 0;
1072                         
1073                         for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
1074                                 string oligo = it->first;
1075 //                              int length = oligo.length();
1076                                 
1077                                 if(rawSequence.length() < maxLength){   //let's just assume that the barcodes are the same length
1078                                         success = bdiffs + 10;
1079                                         break;
1080                                 }
1081                                 
1082                                 //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
1083                                 alignment->align(oligo, rawSequence.substr(0,oligo.length()+bdiffs));
1084                                 oligo = alignment->getSeqAAln();
1085                                 string temp = alignment->getSeqBAln();
1086                 
1087                                 int alnLength = oligo.length();
1088                                 
1089                                 for(int i=oligo.length()-1;i>=0;i--){
1090                                         if(oligo[i] != '-'){    alnLength = i+1;        break;  }
1091                                 }
1092                                 oligo = oligo.substr(0,alnLength);
1093                                 temp = temp.substr(0,alnLength);
1094                                 
1095                                 int numDiff = countDiffs(oligo, temp);
1096                                 
1097                                 if(numDiff < minDiff){
1098                                         minDiff = numDiff;
1099                                         minCount = 1;
1100                                         minGroup = it->second;
1101                                         minPos = 0;
1102                                         for(int i=0;i<alnLength;i++){
1103                                                 if(temp[i] != '-'){
1104                                                         minPos++;
1105                                                 }
1106                                         }
1107                                 }
1108                                 else if(numDiff == minDiff){
1109                                         minCount++;
1110                                 }
1111
1112                         }
1113
1114                         if(minDiff > bdiffs)    {       success = minDiff;              }       //no good matches
1115                         else if(minCount > 1)   {       success = bdiffs + 100; }       //can't tell the difference between multiple barcodes
1116                         else{                                                                                                   //use the best match
1117                                 group = minGroup;
1118                                 seq.setUnaligned(rawSequence.substr(minPos));
1119                                 
1120                                 if(qual.getName() != ""){
1121                                         qual.trimQScores(minPos, -1);
1122                                 }
1123                                 success = minDiff;
1124                         }
1125                         
1126                         if (alignment != NULL) {  delete alignment;  }
1127                         
1128                 }
1129                 
1130                 return success;
1131                 
1132         }
1133         catch(exception& e) {
1134                 m->errorOut(e, "TrimSeqsCommand", "stripBarcode");
1135                 exit(1);
1136         }
1137
1138 }
1139
1140 //***************************************************************************************************************
1141
1142 int TrimSeqsCommand::stripForward(Sequence& seq, QualityScores& qual, int& group){
1143         try {
1144                 string rawSequence = seq.getUnaligned();
1145                 int success = pdiffs + 1;       //guilty until proven innocent
1146                 
1147                 //can you find the primer
1148                 for(map<string,int>::iterator it=primers.begin();it!=primers.end();it++){
1149                         string oligo = it->first;
1150                         if(rawSequence.length() < oligo.length()){      //let's just assume that the primers are the same length
1151                                 success = pdiffs + 10;                                  //if the sequence is shorter than the barcode then bail out
1152                                 break;  
1153                         }
1154                         
1155                         if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
1156                                 group = it->second;
1157                                 seq.setUnaligned(rawSequence.substr(oligo.length()));
1158                                 if(qual.getName() != ""){
1159                                         qual.trimQScores(oligo.length(), -1);
1160                                 }
1161                                 success = 0;
1162                                 break;
1163                         }
1164                 }
1165
1166                 //if you found the barcode or if you don't want to allow for diffs
1167                 if ((pdiffs == 0) || (success == 0)) { return success;  }
1168                 
1169                 else { //try aligning and see if you can find it
1170
1171                         int maxLength = 0;
1172
1173                         Alignment* alignment;
1174                         if (primers.size() > 0) {
1175                                 map<string,int>::iterator it=primers.begin();
1176
1177                                 for(it;it!=primers.end();it++){
1178                                         if(it->first.length() > maxLength){
1179                                                 maxLength = it->first.length();
1180                                         }
1181                                 }
1182                                 alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+pdiffs+1));  
1183
1184                         }else{ alignment = NULL; } 
1185                         
1186                         //can you find the barcode
1187                         int minDiff = 1e6;
1188                         int minCount = 1;
1189                         int minGroup = -1;
1190                         int minPos = 0;
1191                         
1192                         for(map<string,int>::iterator it=primers.begin();it!=primers.end();it++){
1193                                 string oligo = it->first;
1194 //                              int length = oligo.length();
1195                                 
1196                                 if(rawSequence.length() < maxLength){   
1197                                         success = pdiffs + 100;
1198                                         break;
1199                                 }
1200                                 
1201                                 //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
1202                                 alignment->align(oligo, rawSequence.substr(0,oligo.length()+pdiffs));
1203                                 oligo = alignment->getSeqAAln();
1204                                 string temp = alignment->getSeqBAln();
1205                 
1206                                 int alnLength = oligo.length();
1207                                 
1208                                 for(int i=oligo.length()-1;i>=0;i--){
1209                                         if(oligo[i] != '-'){    alnLength = i+1;        break;  }
1210                                 }
1211                                 oligo = oligo.substr(0,alnLength);
1212                                 temp = temp.substr(0,alnLength);
1213                                 
1214                                 int numDiff = countDiffs(oligo, temp);
1215                                 
1216                                 if(numDiff < minDiff){
1217                                         minDiff = numDiff;
1218                                         minCount = 1;
1219                                         minGroup = it->second;
1220                                         minPos = 0;
1221                                         for(int i=0;i<alnLength;i++){
1222                                                 if(temp[i] != '-'){
1223                                                         minPos++;
1224                                                 }
1225                                         }
1226                                 }
1227                                 else if(numDiff == minDiff){
1228                                         minCount++;
1229                                 }
1230
1231                         }
1232
1233                         if(minDiff > pdiffs)    {       success = minDiff;              }       //no good matches
1234                         else if(minCount > 1)   {       success = pdiffs + 10;  }       //can't tell the difference between multiple primers
1235                         else{                                                                                                   //use the best match
1236                                 group = minGroup;
1237                                 seq.setUnaligned(rawSequence.substr(minPos));
1238                                 if(qual.getName() != ""){
1239                                         qual.trimQScores(minPos, -1);
1240                                 }
1241                                 success = minDiff;
1242                         }
1243                         
1244                         if (alignment != NULL) {  delete alignment;  }
1245                         
1246                 }
1247                 
1248                 return success;
1249
1250         }
1251         catch(exception& e) {
1252                 m->errorOut(e, "TrimSeqsCommand", "stripForward");
1253                 exit(1);
1254         }
1255 }
1256
1257 //***************************************************************************************************************
1258
1259 bool TrimSeqsCommand::stripReverse(Sequence& seq, QualityScores& qual){
1260         try {
1261                 string rawSequence = seq.getUnaligned();
1262                 bool success = 0;       //guilty until proven innocent
1263                 
1264                 for(int i=0;i<numRPrimers;i++){
1265                         string oligo = revPrimer[i];
1266                         
1267                         if(rawSequence.length() < oligo.length()){
1268                                 success = 0;
1269                                 break;
1270                         }
1271                         
1272                         if(compareDNASeq(oligo, rawSequence.substr(rawSequence.length()-oligo.length(),oligo.length()))){
1273                                 seq.setUnaligned(rawSequence.substr(0,rawSequence.length()-oligo.length()));
1274                                 if(qual.getName() != ""){
1275                                         qual.trimQScores(-1, rawSequence.length()-oligo.length());
1276                                 }
1277                                 success = 1;
1278                                 break;
1279                         }
1280                 }       
1281                 return success;
1282                 
1283         }
1284         catch(exception& e) {
1285                 m->errorOut(e, "TrimSeqsCommand", "stripReverse");
1286                 exit(1);
1287         }
1288 }
1289
1290 //***************************************************************************************************************
1291
1292 bool TrimSeqsCommand::keepFirstTrim(Sequence& sequence, QualityScores& qscores){
1293         try {
1294                 bool success = 1;
1295                 if(qscores.getName() != ""){
1296                         qscores.trimQScores(-1, keepFirst);
1297                 }
1298                 sequence.trim(keepFirst);
1299                 return success;
1300         }
1301         catch(exception& e) {
1302                 m->errorOut(e, "keepFirstTrim", "countDiffs");
1303                 exit(1);
1304         }
1305         
1306 }       
1307
1308 //***************************************************************************************************************
1309
1310 bool TrimSeqsCommand::removeLastTrim(Sequence& sequence, QualityScores& qscores){
1311         try {
1312                 bool success = 0;
1313                 
1314                 int length = sequence.getNumBases() - removeLast;
1315                 
1316                 if(length > 0){
1317                         if(qscores.getName() != ""){
1318                                 qscores.trimQScores(-1, length);
1319                         }
1320                         sequence.trim(length);
1321                         success = 1;
1322                 }
1323                 else{
1324                         success = 0;
1325                 }
1326
1327                 return success;
1328         }
1329         catch(exception& e) {
1330                 m->errorOut(e, "removeLastTrim", "countDiffs");
1331                 exit(1);
1332         }
1333         
1334 }       
1335
1336 //***************************************************************************************************************
1337
1338 bool TrimSeqsCommand::cullLength(Sequence& seq){
1339         try {
1340         
1341                 int length = seq.getNumBases();
1342                 bool success = 0;       //guilty until proven innocent
1343                 
1344                 if(length >= minLength && maxLength == 0)                       {       success = 1;    }
1345                 else if(length >= minLength && length <= maxLength)     {       success = 1;    }
1346                 else                                                                                            {       success = 0;    }
1347                 
1348                 return success;
1349         
1350         }
1351         catch(exception& e) {
1352                 m->errorOut(e, "TrimSeqsCommand", "cullLength");
1353                 exit(1);
1354         }
1355         
1356 }
1357
1358 //***************************************************************************************************************
1359
1360 bool TrimSeqsCommand::cullHomoP(Sequence& seq){
1361         try {
1362                 int longHomoP = seq.getLongHomoPolymer();
1363                 bool success = 0;       //guilty until proven innocent
1364                 
1365                 if(longHomoP <= maxHomoP){      success = 1;    }
1366                 else                                    {       success = 0;    }
1367                 
1368                 return success;
1369         }
1370         catch(exception& e) {
1371                 m->errorOut(e, "TrimSeqsCommand", "cullHomoP");
1372                 exit(1);
1373         }
1374         
1375 }
1376
1377 //***************************************************************************************************************
1378
1379 bool TrimSeqsCommand::cullAmbigs(Sequence& seq){
1380         try {
1381                 int numNs = seq.getAmbigBases();
1382                 bool success = 0;       //guilty until proven innocent
1383                 
1384                 if(numNs <= maxAmbig)   {       success = 1;    }
1385                 else                                    {       success = 0;    }
1386                 
1387                 return success;
1388         }
1389         catch(exception& e) {
1390                 m->errorOut(e, "TrimSeqsCommand", "cullAmbigs");
1391                 exit(1);
1392         }
1393         
1394 }
1395
1396 //***************************************************************************************************************
1397
1398 bool TrimSeqsCommand::compareDNASeq(string oligo, string seq){
1399         try {
1400                 bool success = 1;
1401                 int length = oligo.length();
1402                 
1403                 for(int i=0;i<length;i++){
1404                         
1405                         if(oligo[i] != seq[i]){
1406                                 if(oligo[i] == 'A' || oligo[i] == 'T' || oligo[i] == 'G' || oligo[i] == 'C')    {       success = 0;    }
1407                                 else if((oligo[i] == 'N' || oligo[i] == 'I') && (seq[i] == 'N'))                                {       success = 0;    }
1408                                 else if(oligo[i] == 'R' && (seq[i] != 'A' && seq[i] != 'G'))                                    {       success = 0;    }
1409                                 else if(oligo[i] == 'Y' && (seq[i] != 'C' && seq[i] != 'T'))                                    {       success = 0;    }
1410                                 else if(oligo[i] == 'M' && (seq[i] != 'C' && seq[i] != 'A'))                                    {       success = 0;    }
1411                                 else if(oligo[i] == 'K' && (seq[i] != 'T' && seq[i] != 'G'))                                    {       success = 0;    }
1412                                 else if(oligo[i] == 'W' && (seq[i] != 'T' && seq[i] != 'A'))                                    {       success = 0;    }
1413                                 else if(oligo[i] == 'S' && (seq[i] != 'C' && seq[i] != 'G'))                                    {       success = 0;    }
1414                                 else if(oligo[i] == 'B' && (seq[i] != 'C' && seq[i] != 'T' && seq[i] != 'G'))   {       success = 0;    }
1415                                 else if(oligo[i] == 'D' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'G'))   {       success = 0;    }
1416                                 else if(oligo[i] == 'H' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'C'))   {       success = 0;    }
1417                                 else if(oligo[i] == 'V' && (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G'))   {       success = 0;    }                       
1418                                 
1419                                 if(success == 0)        {       break;   }
1420                         }
1421                         else{
1422                                 success = 1;
1423                         }
1424                 }
1425                 
1426                 return success;
1427         }
1428         catch(exception& e) {
1429                 m->errorOut(e, "TrimSeqsCommand", "compareDNASeq");
1430                 exit(1);
1431         }
1432
1433 }
1434
1435 //***************************************************************************************************************
1436
1437 int TrimSeqsCommand::countDiffs(string oligo, string seq){
1438         try {
1439
1440                 int length = oligo.length();
1441                 int countDiffs = 0;
1442                 
1443                 for(int i=0;i<length;i++){
1444                                                                 
1445                         if(oligo[i] != seq[i]){
1446                                 if(oligo[i] == 'A' || oligo[i] == 'T' || oligo[i] == 'G' || oligo[i] == 'C' || oligo[i] == '-' || oligo[i] == '.')      {       countDiffs++;   }
1447                                 else if((oligo[i] == 'N' || oligo[i] == 'I') && (seq[i] == 'N'))                                {       countDiffs++;   }
1448                                 else if(oligo[i] == 'R' && (seq[i] != 'A' && seq[i] != 'G'))                                    {       countDiffs++;   }
1449                                 else if(oligo[i] == 'Y' && (seq[i] != 'C' && seq[i] != 'T'))                                    {       countDiffs++;   }
1450                                 else if(oligo[i] == 'M' && (seq[i] != 'C' && seq[i] != 'A'))                                    {       countDiffs++;   }
1451                                 else if(oligo[i] == 'K' && (seq[i] != 'T' && seq[i] != 'G'))                                    {       countDiffs++;   }
1452                                 else if(oligo[i] == 'W' && (seq[i] != 'T' && seq[i] != 'A'))                                    {       countDiffs++;   }
1453                                 else if(oligo[i] == 'S' && (seq[i] != 'C' && seq[i] != 'G'))                                    {       countDiffs++;   }
1454                                 else if(oligo[i] == 'B' && (seq[i] != 'C' && seq[i] != 'T' && seq[i] != 'G'))   {       countDiffs++;   }
1455                                 else if(oligo[i] == 'D' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'G'))   {       countDiffs++;   }
1456                                 else if(oligo[i] == 'H' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'C'))   {       countDiffs++;   }
1457                                 else if(oligo[i] == 'V' && (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G'))   {       countDiffs++;   }       
1458                         }
1459                         
1460                 }
1461                 
1462                 return countDiffs;
1463         }
1464         catch(exception& e) {
1465                 m->errorOut(e, "TrimSeqsCommand", "countDiffs");
1466                 exit(1);
1467         }
1468
1469 }
1470
1471 //***************************************************************************************************************