]> git.donarmstrong.com Git - mothur.git/blob - trimseqscommand.cpp
fix trim.seqs / qualscores bug
[mothur.git] / trimseqscommand.cpp
1 /*
2  *  trimseqscommand.cpp
3  *  Mothur
4  *
5  *  Created by Pat Schloss on 6/6/09.
6  *  Copyright 2009 Patrick D. Schloss. All rights reserved.
7  *
8  */
9
10 #include "trimseqscommand.h"
11 #include "needlemanoverlap.hpp"
12
13 //**********************************************************************************************************************
14
15 vector<string> TrimSeqsCommand::getValidParameters(){   
16         try {
17                 string Array[] =  {"fasta", "flip", "oligos", "maxambig", "maxhomop","minlength", "maxlength", "qfile", 
18                                                                         "qthreshold", "qwindowaverage", "qstepsize", "qwindowsize", "qaverage", "rollaverage",
19                                                                         "keepfirst", "removelast",
20                                                                         "allfiles", "qtrim","tdiffs", "pdiffs", "bdiffs", "processors", "outputdir","inputdir"};
21                 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
22                 return myArray;
23         }
24         catch(exception& e) {
25                 m->errorOut(e, "TrimSeqsCommand", "getValidParameters");
26                 exit(1);
27         }
28 }
29
30 //**********************************************************************************************************************
31
32 TrimSeqsCommand::TrimSeqsCommand(){     
33         try {
34                 abort = true; calledHelp = true; 
35                 vector<string> tempOutNames;
36                 outputTypes["fasta"] = tempOutNames;
37                 outputTypes["qfile"] = tempOutNames;
38                 outputTypes["group"] = tempOutNames;
39         }
40         catch(exception& e) {
41                 m->errorOut(e, "TrimSeqsCommand", "TrimSeqsCommand");
42                 exit(1);
43         }
44 }
45
46 //**********************************************************************************************************************
47
48 vector<string> TrimSeqsCommand::getRequiredParameters(){        
49         try {
50                 string Array[] =  {"fasta"};
51                 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
52                 return myArray;
53         }
54         catch(exception& e) {
55                 m->errorOut(e, "TrimSeqsCommand", "getRequiredParameters");
56                 exit(1);
57         }
58 }
59
60 //**********************************************************************************************************************
61
62 vector<string> TrimSeqsCommand::getRequiredFiles(){     
63         try {
64                 vector<string> myArray;
65                 return myArray;
66         }
67         catch(exception& e) {
68                 m->errorOut(e, "TrimSeqsCommand", "getRequiredFiles");
69                 exit(1);
70         }
71 }
72
73 //***************************************************************************************************************
74
75 TrimSeqsCommand::TrimSeqsCommand(string option)  {
76         try {
77                 
78                 abort = false; calledHelp = false;   
79                 comboStarts = 0;
80                 
81                 //allow user to run help
82                 if(option == "help") { help(); abort = true; calledHelp = true; }
83                 
84                 else {
85                         //valid paramters for this command
86                         string AlignArray[] =  {        "fasta", "flip", "oligos", "maxambig", "maxhomop", "minlength", "maxlength", "qfile", 
87                                                                 "qthreshold", "qwindowaverage", "qstepsize", "qwindowsize", "qaverage", "rollaverage",
88                                                                 "keepfirst", "removelast",
89                                                                 "allfiles", "qtrim","tdiffs", "pdiffs", "bdiffs", "processors", "outputdir","inputdir"};
90                         
91                         vector<string> myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string)));
92                         
93                         OptionParser parser(option);
94                         map<string,string> parameters = parser.getParameters();
95                         
96                         ValidParameters validParameter;
97                         map<string,string>::iterator it;
98                         
99                         //check to make sure all parameters are valid for command
100                         for (it = parameters.begin(); it != parameters.end(); it++) { 
101                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
102                         }
103                         
104                         //initialize outputTypes
105                         vector<string> tempOutNames;
106                         outputTypes["fasta"] = tempOutNames;
107                         outputTypes["qfile"] = tempOutNames;
108                         outputTypes["group"] = tempOutNames;
109                         
110                         //if the user changes the input directory command factory will send this info to us in the output parameter 
111                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
112                         if (inputDir == "not found"){   inputDir = "";          }
113                         else {
114                                 string path;
115                                 it = parameters.find("fasta");
116                                 //user has given a template file
117                                 if(it != parameters.end()){ 
118                                         path = m->hasPath(it->second);
119                                         //if the user has not given a path then, add inputdir. else leave path alone.
120                                         if (path == "") {       parameters["fasta"] = inputDir + it->second;            }
121                                 }
122                                 
123                                 it = parameters.find("oligos");
124                                 //user has given a template file
125                                 if(it != parameters.end()){ 
126                                         path = m->hasPath(it->second);
127                                         //if the user has not given a path then, add inputdir. else leave path alone.
128                                         if (path == "") {       parameters["oligos"] = inputDir + it->second;           }
129                                 }
130                                 
131                                 it = parameters.find("qfile");
132                                 //user has given a template file
133                                 if(it != parameters.end()){ 
134                                         path = m->hasPath(it->second);
135                                         //if the user has not given a path then, add inputdir. else leave path alone.
136                                         if (path == "") {       parameters["qfile"] = inputDir + it->second;            }
137                                 }
138                                 
139                         }
140
141                         
142                         //check for required parameters
143                         fastaFile = validParameter.validFile(parameters, "fasta", true);
144                         if (fastaFile == "not found") { m->mothurOut("fasta is a required parameter for the trim.seqs command."); m->mothurOutEndLine(); abort = true; }
145                         else if (fastaFile == "not open") { abort = true; }     
146                         
147                         //if the user changes the output directory command factory will send this info to us in the output parameter 
148                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  
149                                 outputDir = ""; 
150                                 outputDir += m->hasPath(fastaFile); //if user entered a file with a path then preserve it       
151                         }
152                 
153                         
154                         //check for optional parameter and set defaults
155                         // ...at some point should added some additional type checking...
156                         string temp;
157                         temp = validParameter.validFile(parameters, "flip", false);
158                         if (temp == "not found"){       flip = 0;       }
159                         else if(m->isTrue(temp))        {       flip = 1;       }
160                 
161                         temp = validParameter.validFile(parameters, "oligos", true);
162                         if (temp == "not found"){       oligoFile = "";         }
163                         else if(temp == "not open"){    abort = true;   } 
164                         else                                    {       oligoFile = temp;               }
165                         
166                         
167                         temp = validParameter.validFile(parameters, "maxambig", false);         if (temp == "not found") { temp = "-1"; }
168                         convert(temp, maxAmbig);  
169
170                         temp = validParameter.validFile(parameters, "maxhomop", false);         if (temp == "not found") { temp = "0"; }
171                         convert(temp, maxHomoP);  
172
173                         temp = validParameter.validFile(parameters, "minlength", false);        if (temp == "not found") { temp = "0"; }
174                         convert(temp, minLength); 
175                         
176                         temp = validParameter.validFile(parameters, "maxlength", false);        if (temp == "not found") { temp = "0"; }
177                         convert(temp, maxLength);
178                         
179                         temp = validParameter.validFile(parameters, "bdiffs", false);           if (temp == "not found") { temp = "0"; }
180                         convert(temp, bdiffs);
181                         
182                         temp = validParameter.validFile(parameters, "pdiffs", false);           if (temp == "not found") { temp = "0"; }
183                         convert(temp, pdiffs);
184                         
185                         temp = validParameter.validFile(parameters, "tdiffs", false);           if (temp == "not found") { int tempTotal = pdiffs + bdiffs;  temp = toString(tempTotal); }
186                         convert(temp, tdiffs);
187                         
188                         if(tdiffs == 0){        tdiffs = bdiffs + pdiffs;       }
189                         
190                         temp = validParameter.validFile(parameters, "qfile", true);     
191                         if (temp == "not found")        {       qFileName = "";         }
192                         else if(temp == "not open")     {       abort = true;           }
193                         else                                            {       qFileName = temp;       }
194                         
195                         temp = validParameter.validFile(parameters, "qthreshold", false);       if (temp == "not found") { temp = "0"; }
196                         convert(temp, qThreshold);
197                         
198                         temp = validParameter.validFile(parameters, "qtrim", false);            if (temp == "not found") { temp = "t"; }
199                         qtrim = m->isTrue(temp);
200
201                         temp = validParameter.validFile(parameters, "rollaverage", false);      if (temp == "not found") { temp = "0"; }
202                         convert(temp, qRollAverage);
203
204                         temp = validParameter.validFile(parameters, "qwindowaverage", false);if (temp == "not found") { temp = "0"; }
205                         convert(temp, qWindowAverage);
206
207                         temp = validParameter.validFile(parameters, "qwindowsize", false);      if (temp == "not found") { temp = "50"; }
208                         convert(temp, qWindowSize);
209
210                         temp = validParameter.validFile(parameters, "qstepsize", false);        if (temp == "not found") { temp = "1"; }
211                         convert(temp, qWindowStep);
212
213                         temp = validParameter.validFile(parameters, "qaverage", false);         if (temp == "not found") { temp = "0"; }
214                         convert(temp, qAverage);
215
216                         temp = validParameter.validFile(parameters, "keepfirst", false);        if (temp == "not found") { temp = "0"; }
217                         convert(temp, keepFirst);
218
219                         temp = validParameter.validFile(parameters, "removelast", false);       if (temp == "not found") { temp = "0"; }
220                         convert(temp, removeLast);
221                         
222                         temp = validParameter.validFile(parameters, "allfiles", false);         if (temp == "not found") { temp = "F"; }
223                         allFiles = m->isTrue(temp);
224                         
225                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found") { temp = "1"; }
226                         convert(temp, processors); 
227                         
228                         
229                         if(allFiles && (oligoFile == "")){
230                                 m->mothurOut("You selected allfiles, but didn't enter an oligos.  Ignoring the allfiles request."); m->mothurOutEndLine();
231                         }
232                         if((qAverage != 0 && qThreshold != 0) && qFileName == ""){
233                                 m->mothurOut("You didn't provide a quality file name, quality criteria will be ignored."); m->mothurOutEndLine();
234                                 qAverage=0;
235                                 qThreshold=0;
236                         }
237                         if(!flip && oligoFile=="" && !maxLength && !minLength && (maxAmbig==-1) && !maxHomoP && qFileName == ""){               
238                                 m->mothurOut("You didn't set any options... quiting command."); m->mothurOutEndLine();
239                                 abort = true;
240                         }
241                 }
242
243         }
244         catch(exception& e) {
245                 m->errorOut(e, "TrimSeqsCommand", "TrimSeqsCommand");
246                 exit(1);
247         }
248 }
249
250 //**********************************************************************************************************************
251
252 void TrimSeqsCommand::help(){
253         try {
254                 m->mothurOut("The trim.seqs command reads a fastaFile and creates 2 new fasta files, .trim.fasta and scrap.fasta, as well as group files if you provide and oligos file.\n");
255                 m->mothurOut("The .trim.fasta contains sequences that meet your requirements, and the .scrap.fasta contains those which don't.\n");
256                 m->mothurOut("The trim.seqs command parameters are fasta, flip, oligos, maxambig, maxhomop, minlength, maxlength, qfile, qthreshold, qaverage, diffs, qtrim, keepfirst, removelast and allfiles.\n");
257                 m->mothurOut("The fasta parameter is required.\n");
258                 m->mothurOut("The flip parameter will output the reverse compliment of your trimmed sequence. The default is false.\n");
259                 m->mothurOut("The oligos parameter allows you to provide an oligos file.\n");
260                 m->mothurOut("The maxambig parameter allows you to set the maximum number of ambigious bases allowed. The default is -1.\n");
261                 m->mothurOut("The maxhomop parameter allows you to set a maximum homopolymer length. \n");
262                 m->mothurOut("The minlength parameter allows you to set and minimum sequence length. \n");
263                 m->mothurOut("The maxlength parameter allows you to set and maximum sequence length. \n");
264                 m->mothurOut("The tdiffs parameter is used to specify the total number of differences allowed in the sequence. The default is pdiffs + bdiffs.\n");
265                 m->mothurOut("The bdiffs parameter is used to specify the number of differences allowed in the barcode. The default is 0.\n");
266                 m->mothurOut("The pdiffs parameter is used to specify the number of differences allowed in the primer. The default is 0.\n");
267                 m->mothurOut("The qfile parameter allows you to provide a quality file.\n");
268                 m->mothurOut("The qthreshold parameter allows you to set a minimum quality score allowed. \n");
269                 m->mothurOut("The qaverage parameter allows you to set a minimum average quality score allowed. \n");
270                 m->mothurOut("The qwindowsize parameter allows you to set a number of bases in a window. Default=50.\n");
271                 m->mothurOut("The qwindowaverage parameter allows you to set a minimum average quality score allowed over a window. \n");
272                 m->mothurOut("The rollaverage parameter allows you to set a minimum rolling average quality score allowed over a window. \n");
273                 m->mothurOut("The qstepsize parameter allows you to set a number of bases to move the window over. Default=1.\n");
274                 m->mothurOut("The allfiles parameter will create separate group and fasta file for each grouping. The default is F.\n");
275                 m->mothurOut("The qtrim parameter will trim sequence from the point that they fall below the qthreshold and put it in the .trim file if set to true. The default is T.\n");
276                 m->mothurOut("The keepfirst parameter trims the sequence to the first keepfirst number of bases after the barcode or primers are removed, before the sequence is checked to see if it meets the other requirements. \n");
277                 m->mothurOut("The removelast removes the last removelast number of bases after the barcode or primers are removed, before the sequence is checked to see if it meets the other requirements.\n");
278                 m->mothurOut("The trim.seqs command should be in the following format: \n");
279                 m->mothurOut("trim.seqs(fasta=yourFastaFile, flip=yourFlip, oligos=yourOligos, maxambig=yourMaxambig,  \n");
280                 m->mothurOut("maxhomop=yourMaxhomop, minlength=youMinlength, maxlength=yourMaxlength)  \n");    
281                 m->mothurOut("Example trim.seqs(fasta=abrecovery.fasta, flip=..., oligos=..., maxambig=..., maxhomop=..., minlength=..., maxlength=...).\n");
282                 m->mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n");
283                 m->mothurOut("For more details please check out the wiki http://www.mothur.org/wiki/Trim.seqs .\n\n");
284
285         }
286         catch(exception& e) {
287                 m->errorOut(e, "TrimSeqsCommand", "help");
288                 exit(1);
289         }
290 }
291
292
293 //***************************************************************************************************************
294
295 TrimSeqsCommand::~TrimSeqsCommand(){    /*      do nothing      */      }
296
297 //***************************************************************************************************************
298
299 int TrimSeqsCommand::execute(){
300         try{
301         
302                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
303                 
304                 numFPrimers = 0;  //this needs to be initialized
305                 numRPrimers = 0;
306                 vector<vector<string> > fastaFileNames;
307                 vector<vector<string> > qualFileNames;
308                 
309                 string trimSeqFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "trim.fasta";
310                 outputNames.push_back(trimSeqFile); outputTypes["fasta"].push_back(trimSeqFile);
311                 
312                 string scrapSeqFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "scrap.fasta";
313                 outputNames.push_back(scrapSeqFile); outputTypes["fasta"].push_back(scrapSeqFile);
314                 
315                 string trimQualFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "trim.qual";
316                 string scrapQualFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "scrap.qual";
317                 if (qFileName != "") {
318                         outputNames.push_back(trimQualFile);
319                         outputNames.push_back(scrapQualFile);
320                         outputTypes["qfile"].push_back(trimQualFile);
321                         outputTypes["qfile"].push_back(scrapQualFile); 
322                 }
323                 
324                 string outputGroupFileName;
325                 if(oligoFile != ""){
326                         outputGroupFileName = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "groups";
327                         outputNames.push_back(outputGroupFileName); outputTypes["group"].push_back(outputGroupFileName);
328                         getOligos(fastaFileNames, qualFileNames);
329                 }
330
331                 vector<unsigned long int> fastaFilePos;
332                 vector<unsigned long int> qFilePos;
333                 
334                 setLines(fastaFile, qFileName, fastaFilePos, qFilePos);
335                 
336                 for (int i = 0; i < (fastaFilePos.size()-1); i++) {
337                         lines.push_back(new linePair(fastaFilePos[i], fastaFilePos[(i+1)]));
338                         if (qFileName != "") {  qLines.push_back(new linePair(qFilePos[i], qFilePos[(i+1)]));  }
339                 }       
340                 if(qFileName == "")     {       qLines = lines; } //files with duds
341                 
342                 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
343                                 if(processors == 1){
344                                         driverCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, outputGroupFileName, fastaFileNames, qualFileNames, lines[0], qLines[0]);
345                                 }else{
346                                         createProcessesCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, outputGroupFileName, fastaFileNames, qualFileNames); 
347                                 }       
348                 #else
349                                 driverCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, outputGroupFileName, fastaFileNames, qualFileNames, lines[0], qLines[0]);
350                 #endif
351                 
352                 if (m->control_pressed) {  return 0; }                  
353                         
354                 if(allFiles){
355                         map<string, string> uniqueFastaNames;// so we don't add the same groupfile multiple times
356                         map<string, string>::iterator it;
357                         set<string> namesToRemove;
358                         for(int i=0;i<fastaFileNames.size();i++){
359                                 for(int j=0;j<fastaFileNames[0].size();j++){
360                                         if (fastaFileNames[i][j] != "") {
361                                                 if(m->isBlank(fastaFileNames[i][j])){
362                                                         remove(fastaFileNames[i][j].c_str());
363                                                         namesToRemove.insert(fastaFileNames[i][j]);
364                                                         
365                                                         if(qFileName != ""){
366                                                                 remove(qualFileNames[i][j].c_str());
367                                                                 namesToRemove.insert(qualFileNames[i][j]);
368                                                         }
369                                                 }else{  
370                                                         it = uniqueFastaNames.find(fastaFileNames[i][j]);
371                                                         if (it == uniqueFastaNames.end()) {     
372                                                                 uniqueFastaNames[fastaFileNames[i][j]] = barcodeNameVector[i];  
373                                                         }       
374                                                 }
375                                         }
376                                 }
377                         }
378                         
379                         //remove names for outputFileNames, just cleans up the output
380                         vector<string> outputNames2;
381                         for(int i = 0; i < outputNames.size(); i++) { if (namesToRemove.count(outputNames[i]) == 0) { outputNames2.push_back(outputNames[i]); } }
382                         outputNames = outputNames2;
383                         
384                         for (it = uniqueFastaNames.begin(); it != uniqueFastaNames.end(); it++) {
385                                 ifstream in;
386                                 m->openInputFile(it->first, in);
387                                 
388                                 ofstream out;
389                                 string thisGroupName = outputDir + m->getRootName(m->getSimpleName(it->first)) + "groups";
390                                 outputNames.push_back(thisGroupName); outputTypes["group"].push_back(thisGroupName);
391                                 m->openOutputFile(thisGroupName, out);
392                                 
393                                 while (!in.eof()){
394                                         if (m->control_pressed) { break; }
395                                         
396                                         Sequence currSeq(in); m->gobble(in);
397                                         out << currSeq.getName() << '\t' << it->second << endl;
398                                 }
399                                 in.close();
400                                 out.close();
401                         }
402                 }
403                 
404                 if (m->control_pressed) {       for (int i = 0; i < outputNames.size(); i++) {  remove(outputNames[i].c_str()); } return 0;     }
405
406                 //output group counts
407                 m->mothurOutEndLine();
408                 int total = 0;
409 //              for (int i = 0; i < barcodeNameVector.size(); i++) {
410 //                      if ((barcodeNameVector[i] != "") && (groupCounts[i] != 0)) { total += groupCounts[i]; m->mothurOut("Group " + barcodeNameVector[i] + " contains " + toString(groupCounts[i]) + " sequences."); m->mothurOutEndLine(); }
411 //              }
412 //              if (total != 0) { m->mothurOut("Total of all groups is " + toString(total)); m->mothurOutEndLine(); }
413                 
414                         if (m->control_pressed) {       for (int i = 0; i < outputNames.size(); i++) {  remove(outputNames[i].c_str()); } return 0;     }
415
416                 m->mothurOutEndLine();
417                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
418                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
419                 m->mothurOutEndLine();
420                 
421                 return 0;       
422                         
423         }
424         catch(exception& e) {
425                 m->errorOut(e, "TrimSeqsCommand", "execute");
426                 exit(1);
427         }
428 }
429                 
430 /**************************************************************************************/
431
432 int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string trimFileName, string scrapFileName, string trimQFileName, string scrapQFileName, string groupFileName, vector<vector<string> > fastaFileNames, vector<vector<string> > qualFileNames, linePair* line, linePair* qline) {        
433                 
434         try {
435                 
436                 ofstream trimFASTAFile;
437                 m->openOutputFile(trimFileName, trimFASTAFile);
438                 
439                 ofstream scrapFASTAFile;
440                 m->openOutputFile(scrapFileName, scrapFASTAFile);
441                 
442                 ofstream trimQualFile;
443                 ofstream scrapQualFile;
444                 if(qFileName != ""){
445                         m->openOutputFile(trimQFileName, trimQualFile);
446                         m->openOutputFile(scrapQFileName, scrapQualFile);
447                 }
448                 
449                 ofstream outGroupsFile;
450                 if (oligoFile != ""){   m->openOutputFile(groupFileName, outGroupsFile);   }
451                 if(allFiles){
452                         for (int i = 0; i < fastaFileNames.size(); i++) { //clears old file
453                                 for (int j = 0; j < fastaFileNames[i].size(); j++) { //clears old file
454                                         if (fastaFileNames[i][j] != "") {
455                                                 ofstream temp;
456                                                 m->openOutputFile(fastaFileNames[i][j], temp);                  temp.close();
457                                                 if(qFileName != ""){
458                                                         m->openOutputFile(qualFileNames[i][j], temp);                   temp.close();
459                                                 }
460                                         }
461                                 }
462                         }
463                 }
464                 
465                 ifstream inFASTA;
466                 m->openInputFile(filename, inFASTA);
467                 inFASTA.seekg(line->start);
468                 
469                 ifstream qFile;
470                 if(qFileName != "")     {
471                         m->openInputFile(qFileName, qFile);
472                         qFile.seekg(qline->start);  
473                 }
474                 
475                 int count = 0;
476                 bool moreSeqs = 1;
477         
478                 while (moreSeqs) {
479                                 
480                         if (m->control_pressed) { 
481                                 inFASTA.close(); trimFASTAFile.close(); scrapFASTAFile.close();
482                                 if (oligoFile != "") {   outGroupsFile.close();   }
483
484                                 if(qFileName != ""){
485                                         qFile.close();
486                                 }
487                                 for (int i = 0; i < outputNames.size(); i++) {  remove(outputNames[i].c_str()); }
488
489                                 return 0;
490                         }
491                         
492                         int success = 1;
493                         string trashCode = "";
494                         int currentSeqsDiffs = 0;
495
496                         Sequence currSeq(inFASTA); m->gobble(inFASTA);
497
498                         QualityScores currQual;
499                         if(qFileName != ""){
500                                 currQual = QualityScores(qFile);  m->gobble(qFile);
501                         }
502
503                         string origSeq = currSeq.getUnaligned();
504                         if (origSeq != "") {
505                                 
506                                 int barcodeIndex = 0;
507                                 int primerIndex = 0;
508                                 
509                                 if(barcodes.size() != 0){
510                                         success = stripBarcode(currSeq, currQual, barcodeIndex);
511                                         if(success > bdiffs)            {       trashCode += 'b';       }
512                                         else{ currentSeqsDiffs += success;  }
513                                 }
514                                 
515                                 if(numFPrimers != 0){
516                                         success = stripForward(currSeq, currQual, primerIndex);
517                                         if(success > pdiffs)            {       trashCode += 'f';       }
518                                         else{ currentSeqsDiffs += success;  }
519                                 }
520                                 
521                                 if (currentSeqsDiffs > tdiffs)  {       trashCode += 't';   }
522                                 
523                                 if(numRPrimers != 0){
524                                         success = stripReverse(currSeq, currQual);
525                                         if(!success)                            {       trashCode += 'r';       }
526                                 }
527
528                                 if(keepFirst != 0){
529                                         success = keepFirstTrim(currSeq, currQual);
530                                 }
531                                 
532                                 if(removeLast != 0){
533                                         success = removeLastTrim(currSeq, currQual);
534                                         if(!success)                            {       trashCode += 'l';       }
535                                 }
536
537                                 
538                                 if(qFileName != ""){
539                                         int origLength = currSeq.getNumBases();
540                                         
541                                         if(qThreshold != 0)                     {       success = currQual.stripQualThreshold(currSeq, qThreshold);                     }
542                                         else if(qAverage != 0)          {       success = currQual.cullQualAverage(currSeq, qAverage);                          }
543                                         else if(qRollAverage != 0)      {       success = currQual.stripQualRollingAverage(currSeq, qRollAverage);      }
544                                         else if(qWindowAverage != 0){   success = currQual.stripQualWindowAverage(currSeq, qWindowStep, qWindowSize, qWindowAverage);   }
545                                         else                                            {       success = 1;                            }
546                                         
547                                         //you don't want to trim, if it fails above then scrap it
548                                         if ((!qtrim) && (origLength != currSeq.getNumBases())) {  success = 0; }
549                                         
550                                         if(!success)                            {       trashCode += 'q';       }
551                                 }                               
552                 
553                                 if(minLength > 0 || maxLength > 0){
554                                         success = cullLength(currSeq);
555                                         if(!success)                            {       trashCode += 'l';       }
556                                 }
557                                 if(maxHomoP > 0){
558                                         success = cullHomoP(currSeq);
559                                         if(!success)                            {       trashCode += 'h';       }
560                                 }
561                                 if(maxAmbig != -1){
562                                         success = cullAmbigs(currSeq);
563                                         if(!success)                            {       trashCode += 'n';       }
564                                 }
565                                 
566                                 if(flip){               // should go last                       
567                                         currSeq.reverseComplement();
568                                         if(qFileName != ""){
569                                                 currQual.flipQScores(); 
570                                         }
571                                 }
572                                 
573                                 if(trashCode.length() == 0){
574                                         currSeq.setAligned(currSeq.getUnaligned());
575                                         currSeq.printSequence(trimFASTAFile);
576                                         
577                                         if(qFileName != ""){
578                                                 currQual.printQScores(trimQualFile);
579                                         }
580                                         
581                                         if(barcodes.size() != 0){
582                                                 outGroupsFile << currSeq.getName() << '\t' << barcodeNameVector[barcodeIndex] << endl;
583                                                 groupCounts[barcodeIndex]++;
584                                         }
585                                         
586                                         
587                                         if(allFiles){
588                                                 ofstream output;
589                                                 m->openOutputFileAppend(fastaFileNames[barcodeIndex][primerIndex], output);
590                                                 currSeq.printSequence(output);
591                                                 output.close();
592                                                 
593                                                 if(qFileName != ""){
594                                                         m->openOutputFileAppend(qualFileNames[barcodeIndex][primerIndex], output);
595                                                         currQual.printQScores(output);
596                                                         output.close();                                                 
597                                                 }
598                                         }
599                                 }
600                                 else{
601                                         currSeq.setName(currSeq.getName() + '|' + trashCode);
602                                         currSeq.setUnaligned(origSeq);
603                                         currSeq.setAligned(origSeq);
604                                         currSeq.printSequence(scrapFASTAFile);
605                                         if(qFileName != ""){
606                                                 currQual.printQScores(scrapQualFile);
607                                         }
608                                 }
609                                 count++;
610                         }
611                         
612                         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
613                                 unsigned long int pos = inFASTA.tellg();
614                                 if ((pos == -1) || (pos >= line->end)) { break; }
615                         #else
616                                 if (inFASTA.eof()) { break; }
617                         #endif
618                                 
619                         //report progress
620                         if((count) % 1000 == 0){        m->mothurOut(toString(count)); m->mothurOutEndLine();           }
621                         
622                 }
623                 //report progress
624                 if((count) % 1000 != 0){        m->mothurOut(toString(count)); m->mothurOutEndLine();           }
625
626                 
627                 inFASTA.close();
628                 trimFASTAFile.close();
629                 scrapFASTAFile.close();
630                 if (oligoFile != "") {   outGroupsFile.close();   }
631                 if(qFileName != "")     {       qFile.close();  scrapQualFile.close(); trimQualFile.close();    }
632                 
633                 return count;
634         }
635         catch(exception& e) {
636                 m->errorOut(e, "TrimSeqsCommand", "driverCreateTrim");
637                 exit(1);
638         }
639 }
640
641 /**************************************************************************************************/
642
643 int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName, string trimFASTAFileName, string scrapFASTAFileName, string trimQualFileName, string scrapQualFileName, string groupFile, vector<vector<string> > fastaFileNames, vector<vector<string> > qualFileNames) {
644         try {
645 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
646                 int process = 1;
647                 int exitCommand = 1;
648                 processIDS.clear();
649                 
650                 //loop through and create all the processes you want
651                 while (process != processors) {
652                         int pid = fork();
653                         
654                         if (pid > 0) {
655                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
656                                 process++;
657                         }else if (pid == 0){
658                                 
659                                 vector<vector<string> > tempFASTAFileNames = fastaFileNames;
660                                 vector<vector<string> > tempPrimerQualFileNames = qualFileNames;
661
662                                 if(allFiles){
663                                         ofstream temp;
664
665                                         for(int i=0;i<tempFASTAFileNames.size();i++){
666                                                 for(int j=0;j<tempFASTAFileNames[i].size();j++){
667                                                         if (tempFASTAFileNames[i][j] != "") {
668                                                                 tempFASTAFileNames[i][j] += toString(getpid()) + ".temp";
669                                                                 m->openOutputFile(tempFASTAFileNames[i][j], temp);                      temp.close();
670
671                                                                 if(qFileName != ""){
672                                                                         tempPrimerQualFileNames[i][j] += toString(getpid()) + ".temp";
673                                                                         m->openOutputFile(tempPrimerQualFileNames[i][j], temp);         temp.close();
674                                                                 }
675                                                         }
676                                                 }
677                                         }
678                                 }
679                                                         
680                                 driverCreateTrim(filename,
681                                                                  qFileName,
682                                                                  (trimFASTAFileName + toString(getpid()) + ".temp"),
683                                                                  (scrapFASTAFileName + toString(getpid()) + ".temp"),
684                                                                  (trimQualFileName + toString(getpid()) + ".temp"),
685                                                                  (scrapQualFileName + toString(getpid()) + ".temp"),
686                                                                  (groupFile + toString(getpid()) + ".temp"),
687                                                                  tempFASTAFileNames,
688                                                                  tempPrimerQualFileNames,
689                                                                  lines[process],
690                                                                  qLines[process]);
691                                 
692                                 //pass groupCounts to parent
693                                 ofstream out;
694                                 string tempFile = filename + toString(getpid()) + ".num.temp";
695                                 m->openOutputFile(tempFile, out);
696                                 for(int i = 0; i < groupCounts.size(); i++) {
697                                         out << groupCounts[i] << endl;
698                                 }
699                                 out.close();
700                                 
701                                 exit(0);
702                         }else { 
703                                 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine(); 
704                                 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
705                                 exit(0);
706                         }
707                 }
708                 
709                 //parent do my part
710                 ofstream temp;
711                 m->openOutputFile(trimFASTAFileName, temp);             temp.close();
712                 m->openOutputFile(scrapFASTAFileName, temp);    temp.close();
713                 m->openOutputFile(trimQualFileName, temp);              temp.close();
714                 m->openOutputFile(scrapQualFileName, temp);             temp.close();
715
716                 driverCreateTrim(filename, qFileName, trimFASTAFileName, scrapFASTAFileName, trimQualFileName, scrapQualFileName, groupFile, fastaFileNames, qualFileNames, lines[0], qLines[0]);
717                 
718                 //force parent to wait until all the processes are done
719                 for (int i=0;i<processIDS.size();i++) { 
720                         int temp = processIDS[i];
721                         wait(&temp);
722                 }
723                 
724                 //append files
725                 for(int i=0;i<processIDS.size();i++){
726                         
727                         m->mothurOut("Appending files from process " + toString(processIDS[i])); m->mothurOutEndLine();
728                         
729                         m->appendFiles((trimFASTAFileName + toString(processIDS[i]) + ".temp"), trimFASTAFileName);
730                         remove((trimFASTAFileName + toString(processIDS[i]) + ".temp").c_str());
731                         m->appendFiles((scrapFASTAFileName + toString(processIDS[i]) + ".temp"), scrapFASTAFileName);
732                         remove((scrapFASTAFileName + toString(processIDS[i]) + ".temp").c_str());
733                         
734                         if(qFileName != ""){
735                                 m->appendFiles((trimQualFileName + toString(processIDS[i]) + ".temp"), trimQualFileName);
736                                 remove((trimQualFileName + toString(processIDS[i]) + ".temp").c_str());
737                                 m->appendFiles((scrapQualFileName + toString(processIDS[i]) + ".temp"), scrapQualFileName);
738                                 remove((scrapQualFileName + toString(processIDS[i]) + ".temp").c_str());
739                         }
740                         
741                         m->appendFiles((groupFile + toString(processIDS[i]) + ".temp"), groupFile);
742                         remove((groupFile + toString(processIDS[i]) + ".temp").c_str());
743                         
744                         
745                         if(allFiles){
746                                 for(int j=0;j<fastaFileNames.size();j++){
747                                         for(int k=0;k<fastaFileNames[j].size();k++){
748                                                 if (fastaFileNames[j][k] != "") {
749                                                         m->appendFiles((fastaFileNames[j][k] + toString(processIDS[i]) + ".temp"), fastaFileNames[j][k]);
750                                                         remove((fastaFileNames[j][k] + toString(processIDS[i]) + ".temp").c_str());
751                                                         
752                                                         if(qFileName != ""){
753                                                                 m->appendFiles((qualFileNames[j][k] + toString(processIDS[i]) + ".temp"), qualFileNames[j][k]);
754                                                                 remove((qualFileNames[j][k] + toString(processIDS[i]) + ".temp").c_str());
755                                                         }
756                                                 }
757                                         }
758                                 }
759                         }
760                         
761                         ifstream in;
762                         string tempFile =  filename + toString(processIDS[i]) + ".num.temp";
763                         m->openInputFile(tempFile, in);
764                         int count = 0; 
765                         int tempNum;
766                         while (!in.eof()) { 
767                                 in >> tempNum; m->gobble(in);
768                                 groupCounts[count] += tempNum; 
769                                 count++;
770                         }
771                         in.close(); remove(tempFile.c_str());
772                         
773                 }
774         
775                 return exitCommand;
776 #endif          
777         }
778         catch(exception& e) {
779                 m->errorOut(e, "TrimSeqsCommand", "createProcessesCreateTrim");
780                 exit(1);
781         }
782 }
783
784 /**************************************************************************************************/
785
786 int TrimSeqsCommand::setLines(string filename, string qfilename, vector<unsigned long int>& fastaFilePos, vector<unsigned long int>& qfileFilePos) {
787         try {
788                 
789                 //set file positions for fasta file
790                 fastaFilePos = m->divideFile(filename, processors);
791                 
792                 if (qfilename == "") { return processors; }
793                 
794                 //get name of first sequence in each chunk
795                 map<string, int> firstSeqNames;
796                 for (int i = 0; i < (fastaFilePos.size()-1); i++) {
797                         ifstream in;
798                         m->openInputFile(filename, in);
799                         in.seekg(fastaFilePos[i]);
800                 
801                         Sequence temp(in); 
802                         firstSeqNames[temp.getName()] = i;
803                 
804                         in.close();
805                 }
806                                 
807                 //seach for filePos of each first name in the qfile and save in qfileFilePos
808                 ifstream inQual;
809                 m->openInputFile(qfilename, inQual);
810                 
811                 string input;
812                 while(!inQual.eof()){   
813                         input = m->getline(inQual);
814
815                         if (input.length() != 0) {
816                                 if(input[0] == '>'){ //this is a sequence name line
817                                         istringstream nameStream(input);
818                                         
819                                         string sname = "";  nameStream >> sname;
820                                         sname = sname.substr(1);
821                                         
822                                         map<string, int>::iterator it = firstSeqNames.find(sname);
823                                         
824                                         if(it != firstSeqNames.end()) { //this is the start of a new chunk
825                                                 unsigned long int pos = inQual.tellg(); 
826                                                 qfileFilePos.push_back(pos - input.length() - 1);       
827                                                 firstSeqNames.erase(it);
828                                         }
829                                 }
830                         }
831                         
832                         if (firstSeqNames.size() == 0) { break; }
833                 }
834                 inQual.close();
835                 
836                 
837                 if (firstSeqNames.size() != 0) { 
838                         for (map<string, int>::iterator it = firstSeqNames.begin(); it != firstSeqNames.end(); it++) {
839                                 m->mothurOut(it->first + " is in your fasta file and not in your quality file, not using quality file."); m->mothurOutEndLine();
840                         }
841                         qFileName = "";
842                         return processors;
843                 }
844
845                 //get last file position of qfile
846                 FILE * pFile;
847                 unsigned long int size;
848                 
849                 //get num bytes in file
850                 pFile = fopen (qfilename.c_str(),"rb");
851                 if (pFile==NULL) perror ("Error opening file");
852                 else{
853                         fseek (pFile, 0, SEEK_END);
854                         size=ftell (pFile);
855                         fclose (pFile);
856                 }
857                 
858                 qfileFilePos.push_back(size);
859                 
860                 return processors;
861         }
862         catch(exception& e) {
863                 m->errorOut(e, "TrimSeqsCommand", "setLines");
864                 exit(1);
865         }
866 }
867
868 //***************************************************************************************************************
869
870 void TrimSeqsCommand::getOligos(vector<vector<string> >& fastaFileNames, vector<vector<string> >& qualFileNames){
871         try {
872                 ifstream inOligos;
873                 m->openInputFile(oligoFile, inOligos);
874                 
875                 ofstream test;
876                 
877                 string type, oligo, group;
878
879                 int indexPrimer = 0;
880                 int indexBarcode = 0;
881                 
882                 while(!inOligos.eof()){
883
884                         inOligos >> type; m->gobble(inOligos);
885                                         
886                         if(type[0] == '#'){
887                                 while (!inOligos.eof()) {       char c = inOligos.get(); if (c == 10 || c == 13){       break;  }       } // get rest of line if there's any crap there
888                         }
889                         else{
890                                 //make type case insensitive
891                                 for(int i=0;i<type.length();i++){       type[i] = toupper(type[i]);  }
892                                 
893                                 inOligos >> oligo;
894                                 
895                                 for(int i=0;i<oligo.length();i++){
896                                         oligo[i] = toupper(oligo[i]);
897                                         if(oligo[i] == 'U')     {       oligo[i] = 'T'; }
898                                 }
899                                 
900                                 if(type == "FORWARD"){
901                                         group = "";
902                                         
903                                         // get rest of line in case there is a primer name
904                                         while (!inOligos.eof()) {       
905                                                 char c = inOligos.get(); 
906                                                 if (c == 10 || c == 13){        break;  }
907                                                 else if (c == 32 || c == 9){;} //space or tab
908                                                 else {  group += c;  }
909                                         } 
910                                         
911                                         //check for repeat barcodes
912                                         map<string, int>::iterator itPrime = primers.find(oligo);
913                                         if (itPrime != primers.end()) { m->mothurOut("primer " + oligo + " is in your oligos file already."); m->mothurOutEndLine();  }
914                                         
915                                         primers[oligo]=indexPrimer; indexPrimer++;              
916                                         primerNameVector.push_back(group);
917                                 }
918                                 else if(type == "REVERSE"){
919                                         Sequence oligoRC("reverse", oligo);
920                                         oligoRC.reverseComplement();
921                                         revPrimer.push_back(oligoRC.getUnaligned());
922                                 }
923                                 else if(type == "BARCODE"){
924                                         inOligos >> group;
925                                         
926                                         //check for repeat barcodes
927                                         map<string, int>::iterator itBar = barcodes.find(oligo);
928                                         if (itBar != barcodes.end()) { m->mothurOut("barcode " + oligo + " is in your oligos file already."); m->mothurOutEndLine();  }
929                                                 
930                                         barcodes[oligo]=indexBarcode; indexBarcode++;
931                                         barcodeNameVector.push_back(group);
932                                 }
933                                 else{   m->mothurOut(type + " is not recognized as a valid type. Choices are forward, reverse, and barcode. Ignoring " + oligo + "."); m->mothurOutEndLine();  }
934                         }
935                         m->gobble(inOligos);
936                 }       
937                 inOligos.close();
938                 
939                 if(barcodeNameVector.size() == 0 && primerNameVector[0] == ""){ allFiles = 0;   }
940                 
941                 //add in potential combos
942                 if(barcodeNameVector.size() == 0){
943                         barcodes[""] = 0;
944                         barcodeNameVector.push_back("");                        
945                 }
946                 
947                 if(primerNameVector.size() == 0){
948                         primers[""] = 0;
949                         primerNameVector.push_back("");                 
950                 }
951                 
952                 fastaFileNames.resize(barcodeNameVector.size());
953                 for(int i=0;i<fastaFileNames.size();i++){
954                         fastaFileNames[i].assign(primerNameVector.size(), "");
955                 }
956                 if(qFileName != ""){    qualFileNames = fastaFileNames; }
957                 
958                 if(allFiles){
959                         set<string> uniqueNames; //used to cleanup outputFileNames
960                         for(map<string, int>::iterator itBar = barcodes.begin();itBar != barcodes.end();itBar++){
961                                 for(map<string, int>::iterator itPrimer = primers.begin();itPrimer != primers.end(); itPrimer++){
962                                         
963                                         string primerName = primerNameVector[itPrimer->second];
964                                         string barcodeName = barcodeNameVector[itBar->second];
965                                         
966                                         string comboGroupName = "";
967                                         string fastaFileName = "";
968                                         string qualFileName = "";
969                                         
970                                         if(primerName == ""){
971                                                 comboGroupName = barcodeNameVector[itBar->second];
972                                         }
973                                         else{
974                                                 if(barcodeName == ""){
975                                                         comboGroupName = primerNameVector[itPrimer->second];
976                                                 }
977                                                 else{
978                                                         comboGroupName = barcodeNameVector[itBar->second] + "." + primerNameVector[itPrimer->second];
979                                                 }
980                                         }
981
982                                         ofstream temp;
983                                         fastaFileName = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + comboGroupName + ".fasta";
984                                         if (uniqueNames.count(fastaFileName) == 0) {
985                                                 outputNames.push_back(fastaFileName);
986                                                 outputTypes["fasta"].push_back(fastaFileName);
987                                                 uniqueNames.insert(fastaFileName);
988                                         }
989                                         
990                                         fastaFileNames[itBar->second][itPrimer->second] = fastaFileName;
991                                         m->openOutputFile(fastaFileName, temp);         temp.close();
992
993                                         if(qFileName != ""){
994                                                 qualFileName = outputDir + m->getRootName(m->getSimpleName(qFileName)) + comboGroupName + ".qual";
995                                                 if (uniqueNames.count(fastaFileName) == 0) {
996                                                         outputNames.push_back(qualFileName);
997                                                         outputTypes["qfile"].push_back(qualFileName);
998                                                 }
999                                                 
1000                                                 qualFileNames[itBar->second][itPrimer->second] = qualFileName;
1001                                                 m->openOutputFile(qualFileName, temp);          temp.close();
1002                                         }
1003                                 }
1004                         }
1005                 }
1006                 numFPrimers = primers.size();
1007                 numRPrimers = revPrimer.size();
1008                 groupCounts.resize(barcodeNameVector.size(), 0);
1009
1010         }
1011         catch(exception& e) {
1012                 m->errorOut(e, "TrimSeqsCommand", "getOligos");
1013                 exit(1);
1014         }
1015 }
1016
1017 //***************************************************************************************************************
1018
1019 int TrimSeqsCommand::stripBarcode(Sequence& seq, QualityScores& qual, int& group){
1020         try {
1021                 
1022                 string rawSequence = seq.getUnaligned();
1023                 int success = bdiffs + 1;       //guilty until proven innocent
1024                 
1025                 //can you find the barcode
1026                 for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
1027                         string oligo = it->first;
1028                         if(rawSequence.length() < oligo.length()){      //let's just assume that the barcodes are the same length
1029                                 success = bdiffs + 10;                                  //if the sequence is shorter than the barcode then bail out
1030                                 break;  
1031                         }
1032                         
1033                         if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
1034                                 group = it->second;
1035                                 seq.setUnaligned(rawSequence.substr(oligo.length()));
1036                                 
1037                                 if(qual.getName() != ""){
1038                                         qual.trimQScores(oligo.length(), -1);
1039                                 }
1040                                 
1041                                 success = 0;
1042                                 break;
1043                         }
1044                 }
1045                 
1046                 //if you found the barcode or if you don't want to allow for diffs
1047                 if ((bdiffs == 0) || (success == 0)) { return success;  }
1048                 
1049                 else { //try aligning and see if you can find it
1050
1051                         int maxLength = 0;
1052
1053                         Alignment* alignment;
1054                         if (barcodes.size() > 0) {
1055                                 map<string,int>::iterator it=barcodes.begin();
1056
1057                                 for(it;it!=barcodes.end();it++){
1058                                         if(it->first.length() > maxLength){
1059                                                 maxLength = it->first.length();
1060                                         }
1061                                 }
1062                                 alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+bdiffs+1));  
1063
1064                         }else{ alignment = NULL; } 
1065                         
1066                         //can you find the barcode
1067                         int minDiff = 1e6;
1068                         int minCount = 1;
1069                         int minGroup = -1;
1070                         int minPos = 0;
1071                         
1072                         for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
1073                                 string oligo = it->first;
1074 //                              int length = oligo.length();
1075                                 
1076                                 if(rawSequence.length() < maxLength){   //let's just assume that the barcodes are the same length
1077                                         success = bdiffs + 10;
1078                                         break;
1079                                 }
1080                                 
1081                                 //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
1082                                 alignment->align(oligo, rawSequence.substr(0,oligo.length()+bdiffs));
1083                                 oligo = alignment->getSeqAAln();
1084                                 string temp = alignment->getSeqBAln();
1085                 
1086                                 int alnLength = oligo.length();
1087                                 
1088                                 for(int i=oligo.length()-1;i>=0;i--){
1089                                         if(oligo[i] != '-'){    alnLength = i+1;        break;  }
1090                                 }
1091                                 oligo = oligo.substr(0,alnLength);
1092                                 temp = temp.substr(0,alnLength);
1093                                 
1094                                 int numDiff = countDiffs(oligo, temp);
1095                                 
1096                                 if(numDiff < minDiff){
1097                                         minDiff = numDiff;
1098                                         minCount = 1;
1099                                         minGroup = it->second;
1100                                         minPos = 0;
1101                                         for(int i=0;i<alnLength;i++){
1102                                                 if(temp[i] != '-'){
1103                                                         minPos++;
1104                                                 }
1105                                         }
1106                                 }
1107                                 else if(numDiff == minDiff){
1108                                         minCount++;
1109                                 }
1110
1111                         }
1112
1113                         if(minDiff > bdiffs)    {       success = minDiff;              }       //no good matches
1114                         else if(minCount > 1)   {       success = bdiffs + 100; }       //can't tell the difference between multiple barcodes
1115                         else{                                                                                                   //use the best match
1116                                 group = minGroup;
1117                                 seq.setUnaligned(rawSequence.substr(minPos));
1118                                 
1119                                 if(qual.getName() != ""){
1120                                         qual.trimQScores(minPos, -1);
1121                                 }
1122                                 success = minDiff;
1123                         }
1124                         
1125                         if (alignment != NULL) {  delete alignment;  }
1126                         
1127                 }
1128                 
1129                 return success;
1130                 
1131         }
1132         catch(exception& e) {
1133                 m->errorOut(e, "TrimSeqsCommand", "stripBarcode");
1134                 exit(1);
1135         }
1136
1137 }
1138
1139 //***************************************************************************************************************
1140
1141 int TrimSeqsCommand::stripForward(Sequence& seq, QualityScores& qual, int& group){
1142         try {
1143                 string rawSequence = seq.getUnaligned();
1144                 int success = pdiffs + 1;       //guilty until proven innocent
1145                 
1146                 //can you find the primer
1147                 for(map<string,int>::iterator it=primers.begin();it!=primers.end();it++){
1148                         string oligo = it->first;
1149                         if(rawSequence.length() < oligo.length()){      //let's just assume that the primers are the same length
1150                                 success = pdiffs + 10;                                  //if the sequence is shorter than the barcode then bail out
1151                                 break;  
1152                         }
1153                         
1154                         if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
1155                                 group = it->second;
1156                                 seq.setUnaligned(rawSequence.substr(oligo.length()));
1157                                 if(qual.getName() != ""){
1158                                         qual.trimQScores(oligo.length(), -1);
1159                                 }
1160                                 success = 0;
1161                                 break;
1162                         }
1163                 }
1164
1165                 //if you found the barcode or if you don't want to allow for diffs
1166                 if ((pdiffs == 0) || (success == 0)) { return success;  }
1167                 
1168                 else { //try aligning and see if you can find it
1169
1170                         int maxLength = 0;
1171
1172                         Alignment* alignment;
1173                         if (primers.size() > 0) {
1174                                 map<string,int>::iterator it=primers.begin();
1175
1176                                 for(it;it!=primers.end();it++){
1177                                         if(it->first.length() > maxLength){
1178                                                 maxLength = it->first.length();
1179                                         }
1180                                 }
1181                                 alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+pdiffs+1));  
1182
1183                         }else{ alignment = NULL; } 
1184                         
1185                         //can you find the barcode
1186                         int minDiff = 1e6;
1187                         int minCount = 1;
1188                         int minGroup = -1;
1189                         int minPos = 0;
1190                         
1191                         for(map<string,int>::iterator it=primers.begin();it!=primers.end();it++){
1192                                 string oligo = it->first;
1193 //                              int length = oligo.length();
1194                                 
1195                                 if(rawSequence.length() < maxLength){   
1196                                         success = pdiffs + 100;
1197                                         break;
1198                                 }
1199                                 
1200                                 //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
1201                                 alignment->align(oligo, rawSequence.substr(0,oligo.length()+pdiffs));
1202                                 oligo = alignment->getSeqAAln();
1203                                 string temp = alignment->getSeqBAln();
1204                 
1205                                 int alnLength = oligo.length();
1206                                 
1207                                 for(int i=oligo.length()-1;i>=0;i--){
1208                                         if(oligo[i] != '-'){    alnLength = i+1;        break;  }
1209                                 }
1210                                 oligo = oligo.substr(0,alnLength);
1211                                 temp = temp.substr(0,alnLength);
1212                                 
1213                                 int numDiff = countDiffs(oligo, temp);
1214                                 
1215                                 if(numDiff < minDiff){
1216                                         minDiff = numDiff;
1217                                         minCount = 1;
1218                                         minGroup = it->second;
1219                                         minPos = 0;
1220                                         for(int i=0;i<alnLength;i++){
1221                                                 if(temp[i] != '-'){
1222                                                         minPos++;
1223                                                 }
1224                                         }
1225                                 }
1226                                 else if(numDiff == minDiff){
1227                                         minCount++;
1228                                 }
1229
1230                         }
1231
1232                         if(minDiff > pdiffs)    {       success = minDiff;              }       //no good matches
1233                         else if(minCount > 1)   {       success = pdiffs + 10;  }       //can't tell the difference between multiple primers
1234                         else{                                                                                                   //use the best match
1235                                 group = minGroup;
1236                                 seq.setUnaligned(rawSequence.substr(minPos));
1237                                 if(qual.getName() != ""){
1238                                         qual.trimQScores(minPos, -1);
1239                                 }
1240                                 success = minDiff;
1241                         }
1242                         
1243                         if (alignment != NULL) {  delete alignment;  }
1244                         
1245                 }
1246                 
1247                 return success;
1248
1249         }
1250         catch(exception& e) {
1251                 m->errorOut(e, "TrimSeqsCommand", "stripForward");
1252                 exit(1);
1253         }
1254 }
1255
1256 //***************************************************************************************************************
1257
1258 bool TrimSeqsCommand::stripReverse(Sequence& seq, QualityScores& qual){
1259         try {
1260                 string rawSequence = seq.getUnaligned();
1261                 bool success = 0;       //guilty until proven innocent
1262                 
1263                 for(int i=0;i<numRPrimers;i++){
1264                         string oligo = revPrimer[i];
1265                         
1266                         if(rawSequence.length() < oligo.length()){
1267                                 success = 0;
1268                                 break;
1269                         }
1270                         
1271                         if(compareDNASeq(oligo, rawSequence.substr(rawSequence.length()-oligo.length(),oligo.length()))){
1272                                 seq.setUnaligned(rawSequence.substr(0,rawSequence.length()-oligo.length()));
1273                                 if(qual.getName() != ""){
1274                                         qual.trimQScores(-1, rawSequence.length()-oligo.length());
1275                                 }
1276                                 success = 1;
1277                                 break;
1278                         }
1279                 }       
1280                 return success;
1281                 
1282         }
1283         catch(exception& e) {
1284                 m->errorOut(e, "TrimSeqsCommand", "stripReverse");
1285                 exit(1);
1286         }
1287 }
1288
1289 //***************************************************************************************************************
1290
1291 bool TrimSeqsCommand::keepFirstTrim(Sequence& sequence, QualityScores& qscores){
1292         try {
1293                 bool success = 1;
1294                 if(qscores.getName() != ""){
1295                         qscores.trimQScores(-1, keepFirst);
1296                 }
1297                 sequence.trim(keepFirst);
1298                 return success;
1299         }
1300         catch(exception& e) {
1301                 m->errorOut(e, "keepFirstTrim", "countDiffs");
1302                 exit(1);
1303         }
1304         
1305 }       
1306
1307 //***************************************************************************************************************
1308
1309 bool TrimSeqsCommand::removeLastTrim(Sequence& sequence, QualityScores& qscores){
1310         try {
1311                 bool success = 0;
1312                 
1313                 int length = sequence.getNumBases() - removeLast;
1314                 
1315                 if(length > 0){
1316                         if(qscores.getName() != ""){
1317                                 qscores.trimQScores(-1, length);
1318                         }
1319                         sequence.trim(length);
1320                         success = 1;
1321                 }
1322                 else{
1323                         success = 0;
1324                 }
1325
1326                 return success;
1327         }
1328         catch(exception& e) {
1329                 m->errorOut(e, "removeLastTrim", "countDiffs");
1330                 exit(1);
1331         }
1332         
1333 }       
1334
1335 //***************************************************************************************************************
1336
1337 bool TrimSeqsCommand::cullLength(Sequence& seq){
1338         try {
1339         
1340                 int length = seq.getNumBases();
1341                 bool success = 0;       //guilty until proven innocent
1342                 
1343                 if(length >= minLength && maxLength == 0)                       {       success = 1;    }
1344                 else if(length >= minLength && length <= maxLength)     {       success = 1;    }
1345                 else                                                                                            {       success = 0;    }
1346                 
1347                 return success;
1348         
1349         }
1350         catch(exception& e) {
1351                 m->errorOut(e, "TrimSeqsCommand", "cullLength");
1352                 exit(1);
1353         }
1354         
1355 }
1356
1357 //***************************************************************************************************************
1358
1359 bool TrimSeqsCommand::cullHomoP(Sequence& seq){
1360         try {
1361                 int longHomoP = seq.getLongHomoPolymer();
1362                 bool success = 0;       //guilty until proven innocent
1363                 
1364                 if(longHomoP <= maxHomoP){      success = 1;    }
1365                 else                                    {       success = 0;    }
1366                 
1367                 return success;
1368         }
1369         catch(exception& e) {
1370                 m->errorOut(e, "TrimSeqsCommand", "cullHomoP");
1371                 exit(1);
1372         }
1373         
1374 }
1375
1376 //***************************************************************************************************************
1377
1378 bool TrimSeqsCommand::cullAmbigs(Sequence& seq){
1379         try {
1380                 int numNs = seq.getAmbigBases();
1381                 bool success = 0;       //guilty until proven innocent
1382                 
1383                 if(numNs <= maxAmbig)   {       success = 1;    }
1384                 else                                    {       success = 0;    }
1385                 
1386                 return success;
1387         }
1388         catch(exception& e) {
1389                 m->errorOut(e, "TrimSeqsCommand", "cullAmbigs");
1390                 exit(1);
1391         }
1392         
1393 }
1394
1395 //***************************************************************************************************************
1396
1397 bool TrimSeqsCommand::compareDNASeq(string oligo, string seq){
1398         try {
1399                 bool success = 1;
1400                 int length = oligo.length();
1401                 
1402                 for(int i=0;i<length;i++){
1403                         
1404                         if(oligo[i] != seq[i]){
1405                                 if(oligo[i] == 'A' || oligo[i] == 'T' || oligo[i] == 'G' || oligo[i] == 'C')    {       success = 0;    }
1406                                 else if((oligo[i] == 'N' || oligo[i] == 'I') && (seq[i] == 'N'))                                {       success = 0;    }
1407                                 else if(oligo[i] == 'R' && (seq[i] != 'A' && seq[i] != 'G'))                                    {       success = 0;    }
1408                                 else if(oligo[i] == 'Y' && (seq[i] != 'C' && seq[i] != 'T'))                                    {       success = 0;    }
1409                                 else if(oligo[i] == 'M' && (seq[i] != 'C' && seq[i] != 'A'))                                    {       success = 0;    }
1410                                 else if(oligo[i] == 'K' && (seq[i] != 'T' && seq[i] != 'G'))                                    {       success = 0;    }
1411                                 else if(oligo[i] == 'W' && (seq[i] != 'T' && seq[i] != 'A'))                                    {       success = 0;    }
1412                                 else if(oligo[i] == 'S' && (seq[i] != 'C' && seq[i] != 'G'))                                    {       success = 0;    }
1413                                 else if(oligo[i] == 'B' && (seq[i] != 'C' && seq[i] != 'T' && seq[i] != 'G'))   {       success = 0;    }
1414                                 else if(oligo[i] == 'D' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'G'))   {       success = 0;    }
1415                                 else if(oligo[i] == 'H' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'C'))   {       success = 0;    }
1416                                 else if(oligo[i] == 'V' && (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G'))   {       success = 0;    }                       
1417                                 
1418                                 if(success == 0)        {       break;   }
1419                         }
1420                         else{
1421                                 success = 1;
1422                         }
1423                 }
1424                 
1425                 return success;
1426         }
1427         catch(exception& e) {
1428                 m->errorOut(e, "TrimSeqsCommand", "compareDNASeq");
1429                 exit(1);
1430         }
1431
1432 }
1433
1434 //***************************************************************************************************************
1435
1436 int TrimSeqsCommand::countDiffs(string oligo, string seq){
1437         try {
1438
1439                 int length = oligo.length();
1440                 int countDiffs = 0;
1441                 
1442                 for(int i=0;i<length;i++){
1443                                                                 
1444                         if(oligo[i] != seq[i]){
1445                                 if(oligo[i] == 'A' || oligo[i] == 'T' || oligo[i] == 'G' || oligo[i] == 'C' || oligo[i] == '-' || oligo[i] == '.')      {       countDiffs++;   }
1446                                 else if((oligo[i] == 'N' || oligo[i] == 'I') && (seq[i] == 'N'))                                {       countDiffs++;   }
1447                                 else if(oligo[i] == 'R' && (seq[i] != 'A' && seq[i] != 'G'))                                    {       countDiffs++;   }
1448                                 else if(oligo[i] == 'Y' && (seq[i] != 'C' && seq[i] != 'T'))                                    {       countDiffs++;   }
1449                                 else if(oligo[i] == 'M' && (seq[i] != 'C' && seq[i] != 'A'))                                    {       countDiffs++;   }
1450                                 else if(oligo[i] == 'K' && (seq[i] != 'T' && seq[i] != 'G'))                                    {       countDiffs++;   }
1451                                 else if(oligo[i] == 'W' && (seq[i] != 'T' && seq[i] != 'A'))                                    {       countDiffs++;   }
1452                                 else if(oligo[i] == 'S' && (seq[i] != 'C' && seq[i] != 'G'))                                    {       countDiffs++;   }
1453                                 else if(oligo[i] == 'B' && (seq[i] != 'C' && seq[i] != 'T' && seq[i] != 'G'))   {       countDiffs++;   }
1454                                 else if(oligo[i] == 'D' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'G'))   {       countDiffs++;   }
1455                                 else if(oligo[i] == 'H' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'C'))   {       countDiffs++;   }
1456                                 else if(oligo[i] == 'V' && (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G'))   {       countDiffs++;   }       
1457                         }
1458                         
1459                 }
1460                 
1461                 return countDiffs;
1462         }
1463         catch(exception& e) {
1464                 m->errorOut(e, "TrimSeqsCommand", "countDiffs");
1465                 exit(1);
1466         }
1467
1468 }
1469
1470 //***************************************************************************************************************