]> git.donarmstrong.com Git - mothur.git/blob - trimseqscommand.cpp
added homova command
[mothur.git] / trimseqscommand.cpp
1 /*
2  *  trimseqscommand.cpp
3  *  Mothur
4  *
5  *  Created by Pat Schloss on 6/6/09.
6  *  Copyright 2009 Patrick D. Schloss. All rights reserved.
7  *
8  */
9
10 #include "trimseqscommand.h"
11 #include "needlemanoverlap.hpp"
12
13 //**********************************************************************************************************************
14
15 vector<string> TrimSeqsCommand::getValidParameters(){   
16         try {
17                 string Array[] =  {"fasta", "flip", "oligos", "maxambig", "maxhomop", "group","minlength", "maxlength", "qfile", 
18                                                                         "qthreshold", "qwindowaverage", "qstepsize", "qwindowsize", "qaverage", "rollaverage",
19                                                                         "keepfirst", "removelast",
20                                                                         "allfiles", "qtrim","tdiffs", "pdiffs", "bdiffs", "processors", "outputdir","inputdir"};
21                 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
22                 return myArray;
23         }
24         catch(exception& e) {
25                 m->errorOut(e, "TrimSeqsCommand", "getValidParameters");
26                 exit(1);
27         }
28 }
29
30 //**********************************************************************************************************************
31
32 TrimSeqsCommand::TrimSeqsCommand(){     
33         try {
34                 abort = true;
35                 //initialize outputTypes
36                 vector<string> tempOutNames;
37                 outputTypes["fasta"] = tempOutNames;
38                 outputTypes["qual"] = tempOutNames;
39                 outputTypes["group"] = tempOutNames;
40         }
41         catch(exception& e) {
42                 m->errorOut(e, "TrimSeqsCommand", "TrimSeqsCommand");
43                 exit(1);
44         }
45 }
46
47 //**********************************************************************************************************************
48
49 vector<string> TrimSeqsCommand::getRequiredParameters(){        
50         try {
51                 string Array[] =  {"fasta"};
52                 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
53                 return myArray;
54         }
55         catch(exception& e) {
56                 m->errorOut(e, "TrimSeqsCommand", "getRequiredParameters");
57                 exit(1);
58         }
59 }
60
61 //**********************************************************************************************************************
62
63 vector<string> TrimSeqsCommand::getRequiredFiles(){     
64         try {
65                 vector<string> myArray;
66                 return myArray;
67         }
68         catch(exception& e) {
69                 m->errorOut(e, "TrimSeqsCommand", "getRequiredFiles");
70                 exit(1);
71         }
72 }
73
74 //***************************************************************************************************************
75
76 TrimSeqsCommand::TrimSeqsCommand(string option)  {
77         try {
78                 
79                 abort = false;
80                 comboStarts = 0;
81                 
82                 //allow user to run help
83                 if(option == "help") { help(); abort = true; }
84                 
85                 else {
86                         //valid paramters for this command
87                         string AlignArray[] =  {        "fasta", "flip", "oligos", "maxambig", "maxhomop", "group","minlength", "maxlength", "qfile", 
88                                                                 "qthreshold", "qwindowaverage", "qstepsize", "qwindowsize", "qaverage", "rollaverage",
89                                                                 "keepfirst", "removelast",
90                                                                 "allfiles", "qtrim","tdiffs", "pdiffs", "bdiffs", "processors", "outputdir","inputdir"};
91                         
92                         vector<string> myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string)));
93                         
94                         OptionParser parser(option);
95                         map<string,string> parameters = parser.getParameters();
96                         
97                         ValidParameters validParameter;
98                         map<string,string>::iterator it;
99                         
100                         //check to make sure all parameters are valid for command
101                         for (it = parameters.begin(); it != parameters.end(); it++) { 
102                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
103                         }
104                         
105                         //initialize outputTypes
106                         vector<string> tempOutNames;
107                         outputTypes["fasta"] = tempOutNames;
108                         outputTypes["qual"] = tempOutNames;
109                         outputTypes["group"] = tempOutNames;
110                         
111                         //if the user changes the input directory command factory will send this info to us in the output parameter 
112                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
113                         if (inputDir == "not found"){   inputDir = "";          }
114                         else {
115                                 string path;
116                                 it = parameters.find("fasta");
117                                 //user has given a template file
118                                 if(it != parameters.end()){ 
119                                         path = m->hasPath(it->second);
120                                         //if the user has not given a path then, add inputdir. else leave path alone.
121                                         if (path == "") {       parameters["fasta"] = inputDir + it->second;            }
122                                 }
123                                 
124                                 it = parameters.find("oligos");
125                                 //user has given a template file
126                                 if(it != parameters.end()){ 
127                                         path = m->hasPath(it->second);
128                                         //if the user has not given a path then, add inputdir. else leave path alone.
129                                         if (path == "") {       parameters["oligos"] = inputDir + it->second;           }
130                                 }
131                                 
132                                 it = parameters.find("qfile");
133                                 //user has given a template file
134                                 if(it != parameters.end()){ 
135                                         path = m->hasPath(it->second);
136                                         //if the user has not given a path then, add inputdir. else leave path alone.
137                                         if (path == "") {       parameters["qfile"] = inputDir + it->second;            }
138                                 }
139                                 
140                                 it = parameters.find("group");
141                                 //user has given a template file
142                                 if(it != parameters.end()){ 
143                                         path = m->hasPath(it->second);
144                                         //if the user has not given a path then, add inputdir. else leave path alone.
145                                         if (path == "") {       parameters["group"] = inputDir + it->second;            }
146                                 }
147                         }
148
149                         
150                         //check for required parameters
151                         fastaFile = validParameter.validFile(parameters, "fasta", true);
152                         if (fastaFile == "not found") { m->mothurOut("fasta is a required parameter for the trim.seqs command."); m->mothurOutEndLine(); abort = true; }
153                         else if (fastaFile == "not open") { abort = true; }     
154                         
155                         //if the user changes the output directory command factory will send this info to us in the output parameter 
156                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  
157                                 outputDir = ""; 
158                                 outputDir += m->hasPath(fastaFile); //if user entered a file with a path then preserve it       
159                         }
160                 
161                         
162                         //check for optional parameter and set defaults
163                         // ...at some point should added some additional type checking...
164                         string temp;
165                         temp = validParameter.validFile(parameters, "flip", false);
166                         if (temp == "not found"){       flip = 0;       }
167                         else if(m->isTrue(temp))        {       flip = 1;       }
168                 
169                         temp = validParameter.validFile(parameters, "oligos", true);
170                         if (temp == "not found"){       oligoFile = "";         }
171                         else if(temp == "not open"){    abort = true;   } 
172                         else                                    {       oligoFile = temp;               }
173                         
174                         temp = validParameter.validFile(parameters, "group", true);
175                         if (temp == "not found"){       groupfile = "";         }
176                         else if(temp == "not open"){    abort = true;   } 
177                         else                                    {       groupfile = temp;               }
178                         
179                         temp = validParameter.validFile(parameters, "maxambig", false);         if (temp == "not found") { temp = "-1"; }
180                         convert(temp, maxAmbig);  
181
182                         temp = validParameter.validFile(parameters, "maxhomop", false);         if (temp == "not found") { temp = "0"; }
183                         convert(temp, maxHomoP);  
184
185                         temp = validParameter.validFile(parameters, "minlength", false);        if (temp == "not found") { temp = "0"; }
186                         convert(temp, minLength); 
187                         
188                         temp = validParameter.validFile(parameters, "maxlength", false);        if (temp == "not found") { temp = "0"; }
189                         convert(temp, maxLength);
190                         
191                         temp = validParameter.validFile(parameters, "bdiffs", false);           if (temp == "not found") { temp = "0"; }
192                         convert(temp, bdiffs);
193                         
194                         temp = validParameter.validFile(parameters, "pdiffs", false);           if (temp == "not found") { temp = "0"; }
195                         convert(temp, pdiffs);
196                         
197                         temp = validParameter.validFile(parameters, "tdiffs", false);           if (temp == "not found") { int tempTotal = pdiffs + bdiffs;  temp = toString(tempTotal); }
198                         convert(temp, tdiffs);
199                         
200                         if(tdiffs == 0){        tdiffs = bdiffs + pdiffs;       }
201                         
202                         temp = validParameter.validFile(parameters, "qfile", true);     
203                         if (temp == "not found")        {       qFileName = "";         }
204                         else if(temp == "not open")     {       abort = true;           }
205                         else                                            {       qFileName = temp;       }
206                         
207                         temp = validParameter.validFile(parameters, "qthreshold", false);       if (temp == "not found") { temp = "0"; }
208                         convert(temp, qThreshold);
209                         
210                         temp = validParameter.validFile(parameters, "qtrim", false);            if (temp == "not found") { temp = "t"; }
211                         qtrim = m->isTrue(temp);
212
213                         temp = validParameter.validFile(parameters, "rollaverage", false);      if (temp == "not found") { temp = "0"; }
214                         convert(temp, qRollAverage);
215
216                         temp = validParameter.validFile(parameters, "qwindowaverage", false);if (temp == "not found") { temp = "0"; }
217                         convert(temp, qWindowAverage);
218
219                         temp = validParameter.validFile(parameters, "qwindowsize", false);      if (temp == "not found") { temp = "50"; }
220                         convert(temp, qWindowSize);
221
222                         temp = validParameter.validFile(parameters, "qstepsize", false);        if (temp == "not found") { temp = "1"; }
223                         convert(temp, qWindowStep);
224
225                         temp = validParameter.validFile(parameters, "qaverage", false);         if (temp == "not found") { temp = "0"; }
226                         convert(temp, qAverage);
227
228                         temp = validParameter.validFile(parameters, "keepfirst", false);        if (temp == "not found") { temp = "0"; }
229                         convert(temp, keepFirst);
230
231                         temp = validParameter.validFile(parameters, "removelast", false);       if (temp == "not found") { temp = "0"; }
232                         convert(temp, removeLast);
233                         
234                         temp = validParameter.validFile(parameters, "allfiles", false);         if (temp == "not found") { temp = "F"; }
235                         allFiles = m->isTrue(temp);
236                         
237                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found") { temp = "1"; }
238                         convert(temp, processors); 
239                         
240                         if ((oligoFile != "") && (groupfile != "")) {
241                                 m->mothurOut("You given both a oligos file and a groupfile, only one is allowed."); m->mothurOutEndLine(); abort = true;
242                         }
243                                                                                                 
244                         
245                         if(allFiles && (oligoFile == "") && (groupfile == "")){
246                                 m->mothurOut("You selected allfiles, but didn't enter an oligos or group file.  Ignoring the allfiles request."); m->mothurOutEndLine();
247                         }
248                         if((qAverage != 0 && qThreshold != 0) && qFileName == ""){
249                                 m->mothurOut("You didn't provide a quality file name, quality criteria will be ignored."); m->mothurOutEndLine();
250                                 qAverage=0;
251                                 qThreshold=0;
252                         }
253                         if(!flip && oligoFile=="" && !maxLength && !minLength && (maxAmbig==-1) && !maxHomoP && qFileName == ""){               
254                                 m->mothurOut("You didn't set any options... quiting command."); m->mothurOutEndLine();
255                                 abort = true;
256                         }
257                 }
258
259         }
260         catch(exception& e) {
261                 m->errorOut(e, "TrimSeqsCommand", "TrimSeqsCommand");
262                 exit(1);
263         }
264 }
265
266 //**********************************************************************************************************************
267
268 void TrimSeqsCommand::help(){
269         try {
270                 m->mothurOut("The trim.seqs command reads a fastaFile and creates 2 new fasta files, .trim.fasta and scrap.fasta, as well as group files if you provide and oligos file.\n");
271                 m->mothurOut("The .trim.fasta contains sequences that meet your requirements, and the .scrap.fasta contains those which don't.\n");
272                 m->mothurOut("The trim.seqs command parameters are fasta, flip, oligos, group, maxambig, maxhomop, minlength, maxlength, qfile, qthreshold, qaverage, diffs, qtrim, keepfirst, removelast and allfiles.\n");
273                 m->mothurOut("The fasta parameter is required.\n");
274                 m->mothurOut("The group parameter allows you to enter a group file for your fasta file.\n");
275                 m->mothurOut("The flip parameter will output the reverse compliment of your trimmed sequence. The default is false.\n");
276                 m->mothurOut("The oligos parameter allows you to provide an oligos file.\n");
277                 m->mothurOut("The maxambig parameter allows you to set the maximum number of ambigious bases allowed. The default is -1.\n");
278                 m->mothurOut("The maxhomop parameter allows you to set a maximum homopolymer length. \n");
279                 m->mothurOut("The minlength parameter allows you to set and minimum sequence length. \n");
280                 m->mothurOut("The maxlength parameter allows you to set and maximum sequence length. \n");
281                 m->mothurOut("The tdiffs parameter is used to specify the total number of differences allowed in the sequence. The default is pdiffs + bdiffs.\n");
282                 m->mothurOut("The bdiffs parameter is used to specify the number of differences allowed in the barcode. The default is 0.\n");
283                 m->mothurOut("The pdiffs parameter is used to specify the number of differences allowed in the primer. The default is 0.\n");
284                 m->mothurOut("The qfile parameter allows you to provide a quality file.\n");
285                 m->mothurOut("The qthreshold parameter allows you to set a minimum quality score allowed. \n");
286                 m->mothurOut("The qaverage parameter allows you to set a minimum average quality score allowed. \n");
287                 m->mothurOut("The qwindowsize parameter allows you to set a number of bases in a window. Default=50.\n");
288                 m->mothurOut("The qwindowaverage parameter allows you to set a minimum average quality score allowed over a window. \n");
289                 m->mothurOut("The rollaverage parameter allows you to set a minimum rolling average quality score allowed over a window. \n");
290                 m->mothurOut("The qstepsize parameter allows you to set a number of bases to move the window over. Default=1.\n");
291                 m->mothurOut("The allfiles parameter will create separate group and fasta file for each grouping. The default is F.\n");
292                 m->mothurOut("The qtrim parameter will trim sequence from the point that they fall below the qthreshold and put it in the .trim file if set to true. The default is T.\n");
293                 m->mothurOut("The keepfirst parameter trims the sequence to the first keepfirst number of bases after the barcode or primers are removed, before the sequence is checked to see if it meets the other requirements. \n");
294                 m->mothurOut("The removelast removes the last removelast number of bases after the barcode or primers are removed, before the sequence is checked to see if it meets the other requirements.\n");
295                 m->mothurOut("The trim.seqs command should be in the following format: \n");
296                 m->mothurOut("trim.seqs(fasta=yourFastaFile, flip=yourFlip, oligos=yourOligos, maxambig=yourMaxambig,  \n");
297                 m->mothurOut("maxhomop=yourMaxhomop, minlength=youMinlength, maxlength=yourMaxlength)  \n");    
298                 m->mothurOut("Example trim.seqs(fasta=abrecovery.fasta, flip=..., oligos=..., maxambig=..., maxhomop=..., minlength=..., maxlength=...).\n");
299                 m->mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n");
300                 m->mothurOut("For more details please check out the wiki http://www.mothur.org/wiki/Trim.seqs .\n\n");
301
302         }
303         catch(exception& e) {
304                 m->errorOut(e, "TrimSeqsCommand", "help");
305                 exit(1);
306         }
307 }
308
309
310 //***************************************************************************************************************
311
312 TrimSeqsCommand::~TrimSeqsCommand(){    /*      do nothing      */      }
313
314 //***************************************************************************************************************
315
316 int TrimSeqsCommand::execute(){
317         try{
318         
319                 if (abort == true) { return 0; }
320                 
321                 numFPrimers = 0;  //this needs to be initialized
322                 numRPrimers = 0;
323                 vector<string> fastaFileNames;
324                 vector<string> qualFileNames;
325                 
326                 string trimSeqFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "trim.fasta";
327                 outputNames.push_back(trimSeqFile); outputTypes["fasta"].push_back(trimSeqFile);
328                 string scrapSeqFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "scrap.fasta";
329                 outputNames.push_back(scrapSeqFile); outputTypes["fasta"].push_back(scrapSeqFile);
330                 string trimQualFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "trim.qual";
331                 string scrapQualFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "scrap.qual";
332                 if (qFileName != "") {  outputNames.push_back(trimQualFile); outputNames.push_back(scrapQualFile);  outputTypes["qual"].push_back(trimQualFile); outputTypes["qual"].push_back(scrapQualFile); }
333                 string groupFile = "";
334                 if (groupfile == "") { groupFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "groups"; }
335                 else{
336                         groupFile = outputDir + m->getRootName(m->getSimpleName(groupfile)) + "trim.groups";
337                         outputNames.push_back(groupFile); outputTypes["group"].push_back(groupFile);
338                         groupMap = new GroupMap(groupfile);
339                         groupMap->readMap();
340                         
341                         if(allFiles){
342                                 for (int i = 0; i < groupMap->namesOfGroups.size(); i++) {
343                                         groupToIndex[groupMap->namesOfGroups[i]] = i;
344                                         groupVector.push_back(groupMap->namesOfGroups[i]);
345                                         fastaFileNames.push_back((outputDir + m->getRootName(m->getSimpleName(fastaFile)) +  groupMap->namesOfGroups[i] + ".fasta"));
346                                         
347                                         //we append later, so we want to clear file
348                                         ofstream outRemove;
349                                         m->openOutputFile(fastaFileNames[i], outRemove);
350                                         outRemove.close();
351                                         if(qFileName != ""){
352                                                 qualFileNames.push_back((outputDir + m->getRootName(m->getSimpleName(qFileName)) +  groupMap->namesOfGroups[i] + ".qual"));
353                                                 ofstream outRemove2;
354                                                 m->openOutputFile(qualFileNames[i], outRemove2);
355                                                 outRemove2.close();
356                                         }
357                                 }
358                         }
359                         comboStarts = fastaFileNames.size()-1;
360                 }
361                 
362                 if(oligoFile != ""){
363                         outputNames.push_back(groupFile); outputTypes["group"].push_back(groupFile);
364                         getOligos(fastaFileNames, qualFileNames);
365                 }
366
367                 vector<unsigned long int> fastaFilePos;
368                 vector<unsigned long int> qFilePos;
369                 
370                 setLines(fastaFile, qFileName, fastaFilePos, qFilePos);
371                 
372                 for (int i = 0; i < (fastaFilePos.size()-1); i++) {
373                         lines.push_back(new linePair(fastaFilePos[i], fastaFilePos[(i+1)]));
374                         if (qFileName != "") {  qLines.push_back(new linePair(qFilePos[i], qFilePos[(i+1)]));  }
375                 }       
376                 if(qFileName == "")     {       qLines = lines; } //files with duds
377                 
378                 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
379                                 if(processors == 1){
380                                         driverCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, groupFile, fastaFileNames, qualFileNames, lines[0], qLines[0]);
381                                 }else{
382                                         createProcessesCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, groupFile, fastaFileNames, qualFileNames); 
383                                 }       
384                 #else
385                                 driverCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, groupFile, fastaFileNames, qualFileNames, lines[0], qLines[0]);
386                 #endif
387                 
388                 if (m->control_pressed) {  return 0; }                  
389                         
390                 set<string> blanks;
391                 for(int i=0;i<fastaFileNames.size();i++){
392                         if (m->isBlank(fastaFileNames[i])) {   blanks.insert(fastaFileNames[i]);        }
393                         else if (filesToRemove.count(fastaFileNames[i]) > 0) {  remove(fastaFileNames[i].c_str()); }
394                         else {
395                                 ifstream inFASTA;
396                                 string seqName;
397                                 m->openInputFile(fastaFileNames[i], inFASTA);
398                                 ofstream outGroups;
399                                 string outGroupFilename = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[i])) + "groups";
400                                 
401                                 //if the fastafile is on the blanks list then the groups file should be as well
402                                 if (blanks.count(fastaFileNames[i]) != 0) { blanks.insert(outGroupFilename); }
403                                 
404                                 m->openOutputFile(outGroupFilename, outGroups);
405                                 outputNames.push_back(outGroupFilename); outputTypes["group"].push_back(outGroupFilename);  
406                                 
407                                 string thisGroup = "";
408                                 if (i > comboStarts) {
409                                         map<string, int>::iterator itCombo;
410                                         for(itCombo=combos.begin();itCombo!=combos.end(); itCombo++){
411                                                 if(itCombo->second == i){       thisGroup = itCombo->first;     combos.erase(itCombo);  break;  }
412                                         }
413                                 }else{ thisGroup = groupVector[i]; }
414                                         
415                                 while(!inFASTA.eof()){
416                                         if(inFASTA.get() == '>'){
417                                                 inFASTA >> seqName;
418                                                 outGroups << seqName << '\t' << thisGroup << endl;
419                                         }
420                                         while (!inFASTA.eof())  {       char c = inFASTA.get(); if (c == 10 || c == 13){        break;  }       }
421                                 }
422                                 outGroups.close();
423                                 inFASTA.close();
424                         }
425                 }
426                 
427                 for (set<string>::iterator itBlanks = blanks.begin(); itBlanks != blanks.end(); itBlanks++) {  remove((*(itBlanks)).c_str()); }
428                 
429                 blanks.clear();
430                 if(qFileName != ""){
431                         for(int i=0;i<qualFileNames.size();i++){
432                                 if (m->isBlank(qualFileNames[i])) {  blanks.insert(qualFileNames[i]);   }
433                                 else if (filesToRemove.count(qualFileNames[i]) > 0) {  remove(qualFileNames[i].c_str()); }
434                         }
435                 }
436                 
437                 for (set<string>::iterator itBlanks = blanks.begin(); itBlanks != blanks.end(); itBlanks++) {  remove((*(itBlanks)).c_str()); }
438                 
439                 if (m->control_pressed) { 
440                         for (int i = 0; i < outputNames.size(); i++) {  remove(outputNames[i].c_str()); }
441                         return 0;
442                 }
443
444                 m->mothurOutEndLine();
445                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
446                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
447                 m->mothurOutEndLine();
448                 
449                 return 0;       
450                         
451         }
452         catch(exception& e) {
453                 m->errorOut(e, "TrimSeqsCommand", "execute");
454                 exit(1);
455         }
456 }
457                 
458 /**************************************************************************************/
459
460 int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string trimFile, string scrapFile, string trimQFile, string scrapQFile, string groupFile, vector<string> fastaNames, vector<string> qualNames, linePair* line, linePair* qline) {      
461                 
462         try {
463                 
464                 ofstream outFASTA;
465                 m->openOutputFile(trimFile, outFASTA);
466                 
467                 ofstream scrapFASTA;
468                 m->openOutputFile(scrapFile, scrapFASTA);
469                 
470                 ofstream outQual;
471                 ofstream scrapQual;
472                 if(qFileName != ""){
473                         m->openOutputFile(trimQFile, outQual);
474                         m->openOutputFile(scrapQFile, scrapQual);
475                 }
476                 
477                 ofstream outGroups;
478                 
479                 if (oligoFile != "") {          
480                         m->openOutputFile(groupFile, outGroups);   
481                 }
482                 
483                 ifstream inFASTA;
484                 m->openInputFile(filename, inFASTA);
485                 inFASTA.seekg(line->start);
486                 
487                 ifstream qFile;
488                 if(qFileName != "")     {       m->openInputFile(qFileName, qFile);     qFile.seekg(qline->start);  }
489                 
490                 
491                 for (int i = 0; i < fastaNames.size(); i++) { //clears old file
492                         ofstream temp;
493                         m->openOutputFile(fastaNames[i], temp);
494                         temp.close();
495                 }
496                 for (int i = 0; i < qualNames.size(); i++) { //clears old file
497                         ofstream temp;
498                         m->openOutputFile(qualNames[i], temp);
499                         temp.close();
500                 }
501                 
502                         
503                 bool done = false;
504                 int count = 0;
505         
506                 while (!done) {
507                                 
508                         if (m->control_pressed) { 
509                                 inFASTA.close(); outFASTA.close(); scrapFASTA.close();
510                                 if (oligoFile != "") {   outGroups.close();   }
511
512                                 if(qFileName != ""){
513                                         qFile.close();
514                                 }
515                                 for (int i = 0; i < outputNames.size(); i++) {  remove(outputNames[i].c_str()); }
516
517                                 return 0;
518                         }
519                         
520                         int success = 1;
521                         
522
523                         Sequence currSeq(inFASTA); m->gobble(inFASTA);
524
525                         QualityScores currQual;
526                         if(qFileName != ""){
527                                 currQual = QualityScores(qFile, currSeq.getNumBases());  m->gobble(qFile);
528                         }
529                         
530                         string origSeq = currSeq.getUnaligned();
531                         if (origSeq != "") {
532                                 int groupBar, groupPrime;
533                                 string trashCode = "";
534                                 int currentSeqsDiffs = 0;
535
536                                 if(barcodes.size() != 0){
537                                         success = stripBarcode(currSeq, currQual, groupBar);
538                                         if(success > bdiffs)            {       trashCode += 'b';       }
539                                         else{ currentSeqsDiffs += success;  }
540                                 }
541                                 
542                                 if(numFPrimers != 0){
543                                         success = stripForward(currSeq, currQual, groupPrime);
544                                         if(success > pdiffs)            {       trashCode += 'f';       }
545                                         else{ currentSeqsDiffs += success;  }
546                                 }
547                                 
548                                 if (currentSeqsDiffs > tdiffs)  {       trashCode += 't';   }
549                                 
550                                 if(numRPrimers != 0){
551                                         success = stripReverse(currSeq, currQual);
552                                         if(!success)                            {       trashCode += 'r';       }
553                                 }
554
555                                 if(keepFirst != 0){
556                                         success = keepFirstTrim(currSeq, currQual);
557                                 }
558                                 
559                                 if(removeLast != 0){
560                                         success = removeLastTrim(currSeq, currQual);
561                                         if(!success)                            {       trashCode += 'l';       }
562                                 }
563
564                                 
565                                 if(qFileName != ""){
566                                         int origLength = currSeq.getNumBases();
567                                         
568                                         if(qThreshold != 0)                     {       success = currQual.stripQualThreshold(currSeq, qThreshold);                     }
569                                         else if(qAverage != 0)          {       success = currQual.cullQualAverage(currSeq, qAverage);                          }
570                                         else if(qRollAverage != 0)      {       success = currQual.stripQualRollingAverage(currSeq, qRollAverage);      }
571                                         else if(qWindowAverage != 0){   success = currQual.stripQualWindowAverage(currSeq, qWindowStep, qWindowSize, qWindowAverage);   }
572                                         else                                            {       success = 1;                            }
573                                         
574                                         //you don't want to trim, if it fails above then scrap it
575                                         if ((!qtrim) && (origLength != currSeq.getNumBases())) {  success = 0; }
576                                         
577                                         if(!success)                            {       trashCode += 'q';       }
578                                 }                               
579                 
580                                 if(minLength > 0 || maxLength > 0){
581                                         success = cullLength(currSeq);
582                                         if(!success)                            {       trashCode += 'l';       }
583                                 }
584                                 if(maxHomoP > 0){
585                                         success = cullHomoP(currSeq);
586                                         if(!success)                            {       trashCode += 'h';       }
587                                 }
588                                 if(maxAmbig != -1){
589                                         success = cullAmbigs(currSeq);
590                                         if(!success)                            {       trashCode += 'n';       }
591                                 }
592                                 
593                                 if(flip){               // should go last                       
594                                         currSeq.reverseComplement();
595                                         currQual.flipQScores(); 
596                                 }
597                                 
598                                 if(trashCode.length() == 0){
599                                         currSeq.setAligned(currSeq.getUnaligned());
600                                         currSeq.printSequence(outFASTA);
601                                         currQual.printQScores(outQual);
602                                         
603                                         if(barcodes.size() != 0){
604                                                 string thisGroup = groupVector[groupBar];
605                                                 int indexToFastaFile = groupBar;
606                                                 if (primers.size() != 0){
607                                                         //does this primer have a group
608                                                         if (groupVector[groupPrime] != "") {  
609                                                                 thisGroup += "." + groupVector[groupPrime]; 
610                                                                 indexToFastaFile = combos[thisGroup];
611                                                         }
612                                                 }
613                                                 outGroups << currSeq.getName() << '\t' << thisGroup << endl;
614                                                 if(allFiles){
615                                                         ofstream outTemp;
616                                                         m->openOutputFileAppend(fastaNames[indexToFastaFile], outTemp);
617                                                         //currSeq.printSequence(*fastaFileNames[indexToFastaFile]);
618                                                         currSeq.printSequence(outTemp);
619                                                         outTemp.close();
620                                                         
621                                                         if(qFileName != ""){
622                                                                 //currQual.printQScores(*qualFileNames[indexToFastaFile]);
623                                                                 ofstream outTemp2;
624                                                                 m->openOutputFileAppend(qualNames[indexToFastaFile], outTemp2);
625                                                                 currQual.printQScores(outTemp2);
626                                                                 outTemp2.close();                                                       
627                                                         }
628                                                 }
629                                         }
630                                         
631                                         if (groupfile != "") {
632                                                 string thisGroup = groupMap->getGroup(currSeq.getName());
633                                                 
634                                                 if (thisGroup != "not found") {
635                                                         outGroups << currSeq.getName() << '\t' << thisGroup << endl;
636                                                         if (allFiles) {
637                                                                 ofstream outTemp;
638                                                                 m->openOutputFileAppend(fastaNames[groupToIndex[thisGroup]], outTemp);
639                                                                 currSeq.printSequence(outTemp);
640                                                                 outTemp.close();
641                                                                 if(qFileName != ""){
642                                                                         ofstream outTemp2;
643                                                                         m->openOutputFileAppend(qualNames[groupToIndex[thisGroup]], outTemp2);
644                                                                         currQual.printQScores(outTemp2);
645                                                                         outTemp2.close();                                                       
646                                                                 }
647                                                         }
648                                                 }else{
649                                                         m->mothurOut(currSeq.getName() + " is not in your groupfile, adding to group XXX."); m->mothurOutEndLine();
650                                                         outGroups << currSeq.getName() << '\t' << "XXX" << endl;
651                                                         if (allFiles) {  
652                                                                 m->mothurOut("[ERROR]: " + currSeq.getName() + " will not be added to any .group.fasta or .group.qual file."); m->mothurOutEndLine();
653                                                         }
654                                                 }
655                                         }
656                                 }
657                                 else{
658                                         currSeq.setName(currSeq.getName() + '|' + trashCode);
659                                         currSeq.setUnaligned(origSeq);
660                                         currSeq.setAligned(origSeq);
661                                         currSeq.printSequence(scrapFASTA);
662                                         currQual.printQScores(scrapQual);
663                                 }
664                                 count++;
665                         }
666                         
667                         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
668                                 unsigned long int pos = inFASTA.tellg();
669                                 if ((pos == -1) || (pos >= line->end)) { break; }
670                         #else
671                                 if (inFASTA.eof()) { break; }
672                         #endif
673                                 
674                         //report progress
675                         if((count) % 1000 == 0){        m->mothurOut(toString(count)); m->mothurOutEndLine();           }
676                         
677                 }
678                 //report progress
679                 if((count) % 1000 != 0){        m->mothurOut(toString(count)); m->mothurOutEndLine();           }
680
681                 
682                 inFASTA.close();
683                 outFASTA.close();
684                 scrapFASTA.close();
685                 if (oligoFile != "") {   outGroups.close();   }
686                 if(qFileName != "")     {       qFile.close();  scrapQual.close(); outQual.close();     }
687                 
688                 return count;
689         }
690         catch(exception& e) {
691                 m->errorOut(e, "TrimSeqsCommand", "driverCreateTrim");
692                 exit(1);
693         }
694 }
695
696 /**************************************************************************************************/
697
698 int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName, string trimFile, string scrapFile, string trimQFile, string scrapQFile, string groupFile, vector<string> fastaNames, vector<string> qualNames) {
699         try {
700 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
701                 int process = 1;
702                 int exitCommand = 1;
703                 processIDS.clear();
704                 
705                 //loop through and create all the processes you want
706                 while (process != processors) {
707                         int pid = fork();
708                         
709                         if (pid > 0) {
710                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
711                                 process++;
712                         }else if (pid == 0){
713                                 for (int i = 0; i < fastaNames.size(); i++) {
714                                         fastaNames[i] = (fastaNames[i] + toString(getpid()) + ".temp");
715                                         //clear old file if it exists
716                                         ofstream temp;
717                                         m->openOutputFile(fastaNames[i], temp);
718                                         temp.close();
719                                         if(qFileName != ""){
720                                                 qualNames[i] = (qualNames[i] + toString(getpid()) + ".temp");
721                                                 //clear old file if it exists
722                                                 ofstream temp2;
723                                                 m->openOutputFile(qualNames[i], temp2);
724                                                 temp2.close();
725                                         }
726                                 }
727                                 
728                                 driverCreateTrim(filename, qFileName, (trimFile + toString(getpid()) + ".temp"), (scrapFile + toString(getpid()) + ".temp"), (trimQFile + toString(getpid()) + ".temp"), (scrapQFile + toString(getpid()) + ".temp"), (groupFile + toString(getpid()) + ".temp"), fastaNames, qualNames, lines[process], qLines[process]);
729                                 exit(0);
730                         }else { 
731                                 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine(); 
732                                 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
733                                 exit(0);
734                         }
735                 }
736                 
737                 //parent do my part
738                 for (int i = 0; i < fastaNames.size(); i++) {
739                         //clear old file if it exists
740                         ofstream temp;
741                         m->openOutputFile(fastaNames[i], temp);
742                         temp.close();
743                         if(qFileName != ""){
744                                 //clear old file if it exists
745                                 ofstream temp2;
746                                 m->openOutputFile(qualNames[i], temp2);
747                                 temp2.close();
748                         }
749                 }
750                 
751                 driverCreateTrim(filename, qFileName, trimFile, scrapFile, trimQFile, scrapQFile, groupFile, fastaNames, qualNames, lines[0], qLines[0]);
752                 
753                 
754                 //force parent to wait until all the processes are done
755                 for (int i=0;i<processIDS.size();i++) { 
756                         int temp = processIDS[i];
757                         wait(&temp);
758                 }
759                 
760                 //append files
761                 for(int i=0;i<processIDS.size();i++){
762                         
763                         m->mothurOut("Appending files from process " + toString(processIDS[i])); m->mothurOutEndLine();
764                         
765                         m->appendFiles((trimFile + toString(processIDS[i]) + ".temp"), trimFile);
766                         remove((trimFile + toString(processIDS[i]) + ".temp").c_str());
767                         m->appendFiles((scrapFile + toString(processIDS[i]) + ".temp"), scrapFile);
768                         remove((scrapFile + toString(processIDS[i]) + ".temp").c_str());
769                         
770                         m->mothurOut("Done with fasta files"); m->mothurOutEndLine();
771                         
772                         if(qFileName != ""){
773                                 m->appendFiles((trimQFile + toString(processIDS[i]) + ".temp"), trimQFile);
774                                 remove((trimQFile + toString(processIDS[i]) + ".temp").c_str());
775                                 m->appendFiles((scrapQFile + toString(processIDS[i]) + ".temp"), scrapQFile);
776                                 remove((scrapQFile + toString(processIDS[i]) + ".temp").c_str());
777                         
778                                 m->mothurOut("Done with quality files"); m->mothurOutEndLine();
779                         }
780                         
781                         m->appendFiles((groupFile + toString(processIDS[i]) + ".temp"), groupFile);
782                         remove((groupFile + toString(processIDS[i]) + ".temp").c_str());
783                         
784                         m->mothurOut("Done with group file"); m->mothurOutEndLine();
785                         
786                         for (int j = 0; j < fastaNames.size(); j++) {
787                                 m->appendFiles((fastaNames[j] + toString(processIDS[i]) + ".temp"), fastaNames[j]);
788                                 remove((fastaNames[j] + toString(processIDS[i]) + ".temp").c_str());
789                         }
790                         
791                         if(qFileName != ""){
792                                 for (int j = 0; j < qualNames.size(); j++) {
793                                         m->appendFiles((qualNames[j] + toString(processIDS[i]) + ".temp"), qualNames[j]);
794                                         remove((qualNames[j] + toString(processIDS[i]) + ".temp").c_str());
795                                 }
796                         }
797                         
798                         if (allFiles) { m->mothurOut("Done with allfiles"); m->mothurOutEndLine(); }
799                 }
800         
801                 return exitCommand;
802 #endif          
803         }
804         catch(exception& e) {
805                 m->errorOut(e, "TrimSeqsCommand", "createProcessesCreateTrim");
806                 exit(1);
807         }
808 }
809
810 /**************************************************************************************************/
811
812 int TrimSeqsCommand::setLines(string filename, string qfilename, vector<unsigned long int>& fastaFilePos, vector<unsigned long int>& qfileFilePos) {
813         try {
814                 
815                 //set file positions for fasta file
816                 fastaFilePos = m->divideFile(filename, processors);
817                 
818                 if (qfilename == "") { return processors; }
819                 
820                 //get name of first sequence in each chunk
821                 map<string, int> firstSeqNames;
822                 for (int i = 0; i < (fastaFilePos.size()-1); i++) {
823                         ifstream in;
824                         m->openInputFile(filename, in);
825                         in.seekg(fastaFilePos[i]);
826                 
827                         Sequence temp(in); 
828                         firstSeqNames[temp.getName()] = i;
829                 
830                         in.close();
831                 }
832                                 
833                 //seach for filePos of each first name in the qfile and save in qfileFilePos
834                 ifstream inQual;
835                 m->openInputFile(qfilename, inQual);
836                 
837                 string input;
838                 while(!inQual.eof()){   
839                         input = m->getline(inQual);
840
841                         if (input.length() != 0) {
842                                 if(input[0] == '>'){ //this is a sequence name line
843                                         istringstream nameStream(input);
844                                         
845                                         string sname = "";  nameStream >> sname;
846                                         sname = sname.substr(1);
847                                         
848                                         map<string, int>::iterator it = firstSeqNames.find(sname);
849                                         
850                                         if(it != firstSeqNames.end()) { //this is the start of a new chunk
851                                                 unsigned long int pos = inQual.tellg(); 
852                                                 qfileFilePos.push_back(pos - input.length() - 1);       
853                                                 firstSeqNames.erase(it);
854                                         }
855                                 }
856                         }
857                         
858                         if (firstSeqNames.size() == 0) { break; }
859                 }
860                 inQual.close();
861                 
862                 
863                 if (firstSeqNames.size() != 0) { 
864                         for (map<string, int>::iterator it = firstSeqNames.begin(); it != firstSeqNames.end(); it++) {
865                                 m->mothurOut(it->first + " is in your fasta file and not in your quality file, not using quality file."); m->mothurOutEndLine();
866                         }
867                         qFileName = "";
868                         return processors;
869                 }
870
871                 //get last file position of qfile
872                 FILE * pFile;
873                 unsigned long int size;
874                 
875                 //get num bytes in file
876                 pFile = fopen (qfilename.c_str(),"rb");
877                 if (pFile==NULL) perror ("Error opening file");
878                 else{
879                         fseek (pFile, 0, SEEK_END);
880                         size=ftell (pFile);
881                         fclose (pFile);
882                 }
883                 
884                 qfileFilePos.push_back(size);
885                 
886                 return processors;
887         }
888         catch(exception& e) {
889                 m->errorOut(e, "TrimSeqsCommand", "setLines");
890                 exit(1);
891         }
892 }
893
894 //***************************************************************************************************************
895
896 void TrimSeqsCommand::getOligos(vector<string>& outFASTAVec, vector<string>& outQualVec){
897         try {
898                 ifstream inOligos;
899                 m->openInputFile(oligoFile, inOligos);
900                 
901                 ofstream test;
902                 
903                 string type, oligo, group;
904                 int index=0;
905                 //int indexPrimer = 0;
906                 
907                 while(!inOligos.eof()){
908                         inOligos >> type; m->gobble(inOligos);
909                                         
910                         if(type[0] == '#'){
911                                 while (!inOligos.eof()) {       char c = inOligos.get(); if (c == 10 || c == 13){       break;  }       } // get rest of line if there's any crap there
912                         }
913                         else{
914                                 //make type case insensitive
915                                 for(int i=0;i<type.length();i++){       type[i] = toupper(type[i]);  }
916                                 
917                                 inOligos >> oligo;
918                                 
919                                 for(int i=0;i<oligo.length();i++){
920                                         oligo[i] = toupper(oligo[i]);
921                                         if(oligo[i] == 'U')     {       oligo[i] = 'T'; }
922                                 }
923                                 
924                                 if(type == "FORWARD"){
925                                         group = "";
926                                         
927                                         // get rest of line in case there is a primer name
928                                         while (!inOligos.eof()) {       
929                                                 char c = inOligos.get(); 
930                                                 if (c == 10 || c == 13){        break;  }
931                                                 else if (c == 32 || c == 9){;} //space or tab
932                                                 else {  group += c;  }
933                                         } 
934                                         
935                                         //check for repeat barcodes
936                                         map<string, int>::iterator itPrime = primers.find(oligo);
937                                         if (itPrime != primers.end()) { m->mothurOut("primer " + oligo + " is in your oligos file already."); m->mothurOutEndLine();  }
938                                         
939                                                 primers[oligo]=index; index++;
940                                                 groupVector.push_back(group);
941                                         
942                                                 if(allFiles){
943                                                         outFASTAVec.push_back((outputDir + m->getRootName(m->getSimpleName(fastaFile)) + group + ".fasta"));
944                                                         if(qFileName != ""){
945                                                                 outQualVec.push_back((outputDir + m->getRootName(m->getSimpleName(qFileName)) + group + ".qual"));
946                                                         }
947                                                         if (group == "") { //if there is not a group for this primer, then this file will not get written to, but we add it to keep the indexes correct
948                                                                 filesToRemove.insert((outputDir + m->getRootName(m->getSimpleName(fastaFile)) + group + ".fasta"));
949                                                                 if(qFileName != ""){
950                                                                         filesToRemove.insert((outputDir + m->getRootName(m->getSimpleName(qFileName)) + group + ".qual"));
951                                                                 }
952                                                         }else {
953                                                                 outputNames.push_back((outputDir + m->getRootName(m->getSimpleName(fastaFile)) + group + ".fasta"));
954                                                                 outputTypes["fasta"].push_back((outputDir + m->getRootName(m->getSimpleName(fastaFile)) + group + ".fasta"));
955                                                                 if(qFileName != ""){
956                                                                         outputNames.push_back((outputDir + m->getRootName(m->getSimpleName(qFileName)) + group + ".qual"));
957                                                                         outputTypes["qual"].push_back((outputDir + m->getRootName(m->getSimpleName(qFileName)) + group + ".qual"));
958                                                                 }                                                       
959                                                         }
960                                                 }
961                                         
962                                 }
963                                 else if(type == "REVERSE"){
964                                         Sequence oligoRC("reverse", oligo);
965                                         oligoRC.reverseComplement();
966                                         revPrimer.push_back(oligoRC.getUnaligned());
967                                 }
968                                 else if(type == "BARCODE"){
969                                         inOligos >> group;
970                                         
971                                         //check for repeat barcodes
972                                         map<string, int>::iterator itBar = barcodes.find(oligo);
973                                         if (itBar != barcodes.end()) { m->mothurOut("barcode " + oligo + " is in your oligos file already."); m->mothurOutEndLine();  }
974                                         
975                                                 barcodes[oligo]=index; index++;
976                                                 groupVector.push_back(group);
977                                                 
978                                                 if(allFiles){
979                                                         outputNames.push_back((outputDir + m->getRootName(m->getSimpleName(fastaFile)) + group + ".fasta"));
980                                                         outputNames.push_back((outputDir + m->getRootName(m->getSimpleName(fastaFile)) + group + ".fasta"));
981                                                         outFASTAVec.push_back((outputDir + m->getRootName(m->getSimpleName(fastaFile)) + group + ".fasta"));
982                                                         if(qFileName != ""){
983                                                                 outQualVec.push_back((outputDir + m->getRootName(m->getSimpleName(qFileName)) + group + ".qual"));
984                                                                 outputNames.push_back((outputDir + m->getRootName(m->getSimpleName(qFileName)) + group + ".qual"));
985                                                                 outputTypes["qual"].push_back((outputDir + m->getRootName(m->getSimpleName(qFileName)) + group + ".qual"));
986                                                         }                                                       
987                                                 }
988                                         
989                                 }else{  m->mothurOut(type + " is not recognized as a valid type. Choices are forward, reverse, and barcode. Ignoring " + oligo + "."); m->mothurOutEndLine();  }
990                         }
991                         m->gobble(inOligos);
992                 }
993                 
994                 inOligos.close();
995                 
996                 //add in potential combos
997                 if(allFiles){
998                         comboStarts = outFASTAVec.size()-1;
999                         for (map<string, int>::iterator itBar = barcodes.begin(); itBar != barcodes.end(); itBar++) {
1000                                 for (map<string, int>::iterator itPrime = primers.begin(); itPrime != primers.end(); itPrime++) {
1001                                         if (groupVector[itPrime->second] != "") { //there is a group for this primer
1002                                                 outputNames.push_back((outputDir + m->getRootName(m->getSimpleName(qFileName)) + groupVector[itBar->second] + "." + groupVector[itPrime->second] + ".fasta"));
1003                                                 outputTypes["fasta"].push_back((outputDir + m->getRootName(m->getSimpleName(qFileName)) + groupVector[itBar->second] + "." + groupVector[itPrime->second] + ".fasta"));
1004                                                 outFASTAVec.push_back((outputDir + m->getRootName(m->getSimpleName(fastaFile)) + groupVector[itBar->second] + "." + groupVector[itPrime->second] + ".fasta"));
1005                                                 combos[(groupVector[itBar->second] + "." + groupVector[itPrime->second])] = outFASTAVec.size()-1;
1006                                                 
1007                                                 if(qFileName != ""){
1008                                                         outQualVec.push_back((outputDir + m->getRootName(m->getSimpleName(qFileName)) + groupVector[itBar->second] + "." + groupVector[itPrime->second] + ".qual"));
1009                                                         outputNames.push_back((outputDir + m->getRootName(m->getSimpleName(qFileName)) + groupVector[itBar->second] + "." + groupVector[itPrime->second] + ".qual"));
1010                                                         outputTypes["qual"].push_back((outputDir + m->getRootName(m->getSimpleName(qFileName)) + groupVector[itBar->second] + "." + groupVector[itPrime->second] + ".qual"));
1011                                                 }
1012                                         }
1013                                 }
1014                         }
1015                 }
1016                 
1017                 numFPrimers = primers.size();
1018                 numRPrimers = revPrimer.size();
1019                 
1020         }
1021         catch(exception& e) {
1022                 m->errorOut(e, "TrimSeqsCommand", "getOligos");
1023                 exit(1);
1024         }
1025 }
1026 //***************************************************************************************************************
1027
1028 int TrimSeqsCommand::stripBarcode(Sequence& seq, QualityScores& qual, int& group){
1029         try {
1030                 
1031                 string rawSequence = seq.getUnaligned();
1032                 int success = bdiffs + 1;       //guilty until proven innocent
1033                 
1034                 //can you find the barcode
1035                 for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
1036                         string oligo = it->first;
1037                         if(rawSequence.length() < oligo.length()){      //let's just assume that the barcodes are the same length
1038                                 success = bdiffs + 10;                                  //if the sequence is shorter than the barcode then bail out
1039                                 break;  
1040                         }
1041                         
1042                         if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
1043                                 group = it->second;
1044                                 seq.setUnaligned(rawSequence.substr(oligo.length()));
1045                                 
1046                                 if(qual.getName() != ""){
1047                                         qual.trimQScores(oligo.length(), -1);
1048                                 }
1049                                 
1050                                 success = 0;
1051                                 break;
1052                         }
1053                 }
1054                 
1055                 //if you found the barcode or if you don't want to allow for diffs
1056 //              cout << success;
1057                 if ((bdiffs == 0) || (success == 0)) { return success;  }
1058                 
1059                 else { //try aligning and see if you can find it
1060 //                      cout << endl;
1061
1062                         int maxLength = 0;
1063
1064                         Alignment* alignment;
1065                         if (barcodes.size() > 0) {
1066                                 map<string,int>::iterator it=barcodes.begin();
1067
1068                                 for(it;it!=barcodes.end();it++){
1069                                         if(it->first.length() > maxLength){
1070                                                 maxLength = it->first.length();
1071                                         }
1072                                 }
1073                                 alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+bdiffs+1));  
1074
1075                         }else{ alignment = NULL; } 
1076                         
1077                         //can you find the barcode
1078                         int minDiff = 1e6;
1079                         int minCount = 1;
1080                         int minGroup = -1;
1081                         int minPos = 0;
1082                         
1083                         for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
1084                                 string oligo = it->first;
1085 //                              int length = oligo.length();
1086                                 
1087                                 if(rawSequence.length() < maxLength){   //let's just assume that the barcodes are the same length
1088                                         success = bdiffs + 10;
1089                                         break;
1090                                 }
1091                                 
1092                                 //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
1093                                 alignment->align(oligo, rawSequence.substr(0,oligo.length()+bdiffs));
1094                                 oligo = alignment->getSeqAAln();
1095                                 string temp = alignment->getSeqBAln();
1096                 
1097                                 int alnLength = oligo.length();
1098                                 
1099                                 for(int i=oligo.length()-1;i>=0;i--){
1100                                         if(oligo[i] != '-'){    alnLength = i+1;        break;  }
1101                                 }
1102                                 oligo = oligo.substr(0,alnLength);
1103                                 temp = temp.substr(0,alnLength);
1104                                 
1105                                 int numDiff = countDiffs(oligo, temp);
1106                                 
1107 //                              cout << oligo << '\t' << temp << '\t' << numDiff << endl;                               
1108                                 
1109                                 if(numDiff < minDiff){
1110                                         minDiff = numDiff;
1111                                         minCount = 1;
1112                                         minGroup = it->second;
1113                                         minPos = 0;
1114                                         for(int i=0;i<alnLength;i++){
1115                                                 if(temp[i] != '-'){
1116                                                         minPos++;
1117                                                 }
1118                                         }
1119                                 }
1120                                 else if(numDiff == minDiff){
1121                                         minCount++;
1122                                 }
1123
1124                         }
1125
1126                         if(minDiff > bdiffs)    {       success = minDiff;              }       //no good matches
1127                         else if(minCount > 1)   {       success = bdiffs + 100; }       //can't tell the difference between multiple barcodes
1128                         else{                                                                                                   //use the best match
1129                                 group = minGroup;
1130                                 seq.setUnaligned(rawSequence.substr(minPos));
1131                                 
1132                                 if(qual.getName() != ""){
1133                                         qual.trimQScores(minPos, -1);
1134                                 }
1135                                 success = minDiff;
1136                         }
1137                         
1138                         if (alignment != NULL) {  delete alignment;  }
1139                         
1140                 }
1141 //              cout << success << endl;
1142                 
1143                 return success;
1144                 
1145         }
1146         catch(exception& e) {
1147                 m->errorOut(e, "TrimSeqsCommand", "stripBarcode");
1148                 exit(1);
1149         }
1150
1151 }
1152
1153 //***************************************************************************************************************
1154
1155 int TrimSeqsCommand::stripForward(Sequence& seq, QualityScores& qual, int& group){
1156         try {
1157                 string rawSequence = seq.getUnaligned();
1158                 int success = pdiffs + 1;       //guilty until proven innocent
1159                 
1160                 //can you find the primer
1161                 for(map<string,int>::iterator it=primers.begin();it!=primers.end();it++){
1162                         string oligo = it->first;
1163                         if(rawSequence.length() < oligo.length()){      //let's just assume that the primers are the same length
1164                                 success = pdiffs + 10;                                  //if the sequence is shorter than the barcode then bail out
1165                                 break;  
1166                         }
1167                         
1168                         if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
1169                                 group = it->second;
1170                                 seq.setUnaligned(rawSequence.substr(oligo.length()));
1171                                 if(qual.getName() != ""){
1172                                         qual.trimQScores(oligo.length(), -1);
1173                                 }
1174                                 success = 0;
1175                                 break;
1176                         }
1177                 }
1178
1179                 //if you found the barcode or if you don't want to allow for diffs
1180 //              cout << success;
1181                 if ((pdiffs == 0) || (success == 0)) { return success;  }
1182                 
1183                 else { //try aligning and see if you can find it
1184 //                      cout << endl;
1185
1186                         int maxLength = 0;
1187
1188                         Alignment* alignment;
1189                         if (primers.size() > 0) {
1190                                 map<string,int>::iterator it=primers.begin();
1191
1192                                 for(it;it!=primers.end();it++){
1193                                         if(it->first.length() > maxLength){
1194                                                 maxLength = it->first.length();
1195                                         }
1196                                 }
1197                                 alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+pdiffs+1));  
1198
1199                         }else{ alignment = NULL; } 
1200                         
1201                         //can you find the barcode
1202                         int minDiff = 1e6;
1203                         int minCount = 1;
1204                         int minGroup = -1;
1205                         int minPos = 0;
1206                         
1207                         for(map<string,int>::iterator it=primers.begin();it!=primers.end();it++){
1208                                 string oligo = it->first;
1209 //                              int length = oligo.length();
1210                                 
1211                                 if(rawSequence.length() < maxLength){   
1212                                         success = pdiffs + 100;
1213                                         break;
1214                                 }
1215                                 
1216                                 //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
1217                                 alignment->align(oligo, rawSequence.substr(0,oligo.length()+pdiffs));
1218                                 oligo = alignment->getSeqAAln();
1219                                 string temp = alignment->getSeqBAln();
1220                 
1221                                 int alnLength = oligo.length();
1222                                 
1223                                 for(int i=oligo.length()-1;i>=0;i--){
1224                                         if(oligo[i] != '-'){    alnLength = i+1;        break;  }
1225                                 }
1226                                 oligo = oligo.substr(0,alnLength);
1227                                 temp = temp.substr(0,alnLength);
1228                                 
1229                                 int numDiff = countDiffs(oligo, temp);
1230                                 
1231 //                              cout << oligo << '\t' << temp << '\t' << numDiff << endl;                               
1232                                 
1233                                 if(numDiff < minDiff){
1234                                         minDiff = numDiff;
1235                                         minCount = 1;
1236                                         minGroup = it->second;
1237                                         minPos = 0;
1238                                         for(int i=0;i<alnLength;i++){
1239                                                 if(temp[i] != '-'){
1240                                                         minPos++;
1241                                                 }
1242                                         }
1243                                 }
1244                                 else if(numDiff == minDiff){
1245                                         minCount++;
1246                                 }
1247
1248                         }
1249
1250                         if(minDiff > pdiffs)    {       success = minDiff;              }       //no good matches
1251                         else if(minCount > 1)   {       success = pdiffs + 10;  }       //can't tell the difference between multiple primers
1252                         else{                                                                                                   //use the best match
1253                                 group = minGroup;
1254                                 seq.setUnaligned(rawSequence.substr(minPos));
1255                                 if(qual.getName() != ""){
1256                                         qual.trimQScores(minPos, -1);
1257                                 }
1258                                 success = minDiff;
1259                         }
1260                         
1261                         if (alignment != NULL) {  delete alignment;  }
1262                         
1263                 }
1264                 
1265                 return success;
1266
1267         }
1268         catch(exception& e) {
1269                 m->errorOut(e, "TrimSeqsCommand", "stripForward");
1270                 exit(1);
1271         }
1272 }
1273
1274 //***************************************************************************************************************
1275
1276 bool TrimSeqsCommand::stripReverse(Sequence& seq, QualityScores& qual){
1277         try {
1278                 string rawSequence = seq.getUnaligned();
1279                 bool success = 0;       //guilty until proven innocent
1280                 
1281                 for(int i=0;i<numRPrimers;i++){
1282                         string oligo = revPrimer[i];
1283                         
1284                         if(rawSequence.length() < oligo.length()){
1285                                 success = 0;
1286                                 break;
1287                         }
1288                         
1289                         if(compareDNASeq(oligo, rawSequence.substr(rawSequence.length()-oligo.length(),oligo.length()))){
1290                                 seq.setUnaligned(rawSequence.substr(0,rawSequence.length()-oligo.length()));
1291                                 if(qual.getName() != ""){
1292                                         qual.trimQScores(-1, rawSequence.length()-oligo.length());
1293                                 }
1294                                 success = 1;
1295                                 break;
1296                         }
1297                 }       
1298                 return success;
1299                 
1300         }
1301         catch(exception& e) {
1302                 m->errorOut(e, "TrimSeqsCommand", "stripReverse");
1303                 exit(1);
1304         }
1305 }
1306
1307 //***************************************************************************************************************
1308
1309 bool TrimSeqsCommand::keepFirstTrim(Sequence& sequence, QualityScores& qscores){
1310         try {
1311                 bool success = 1;
1312                 if(qscores.getName() != ""){
1313                         qscores.trimQScores(-1, keepFirst);
1314                 }
1315                 sequence.trim(keepFirst);
1316                 return success;
1317         }
1318         catch(exception& e) {
1319                 m->errorOut(e, "keepFirstTrim", "countDiffs");
1320                 exit(1);
1321         }
1322         
1323 }       
1324
1325 //***************************************************************************************************************
1326
1327 bool TrimSeqsCommand::removeLastTrim(Sequence& sequence, QualityScores& qscores){
1328         try {
1329                 bool success = 0;
1330                 
1331                 int length = sequence.getNumBases() - removeLast;
1332                 
1333                 if(length > 0){
1334                         if(qscores.getName() != ""){
1335                                 qscores.trimQScores(-1, length);
1336                         }
1337                         sequence.trim(length);
1338                         success = 1;
1339                 }
1340                 else{
1341                         success = 0;
1342                 }
1343
1344                 return success;
1345         }
1346         catch(exception& e) {
1347                 m->errorOut(e, "removeLastTrim", "countDiffs");
1348                 exit(1);
1349         }
1350         
1351 }       
1352
1353 //***************************************************************************************************************
1354
1355 bool TrimSeqsCommand::cullLength(Sequence& seq){
1356         try {
1357         
1358                 int length = seq.getNumBases();
1359                 bool success = 0;       //guilty until proven innocent
1360                 
1361                 if(length >= minLength && maxLength == 0)                       {       success = 1;    }
1362                 else if(length >= minLength && length <= maxLength)     {       success = 1;    }
1363                 else                                                                                            {       success = 0;    }
1364                 
1365                 return success;
1366         
1367         }
1368         catch(exception& e) {
1369                 m->errorOut(e, "TrimSeqsCommand", "cullLength");
1370                 exit(1);
1371         }
1372         
1373 }
1374
1375 //***************************************************************************************************************
1376
1377 bool TrimSeqsCommand::cullHomoP(Sequence& seq){
1378         try {
1379                 int longHomoP = seq.getLongHomoPolymer();
1380                 bool success = 0;       //guilty until proven innocent
1381                 
1382                 if(longHomoP <= maxHomoP){      success = 1;    }
1383                 else                                    {       success = 0;    }
1384                 
1385                 return success;
1386         }
1387         catch(exception& e) {
1388                 m->errorOut(e, "TrimSeqsCommand", "cullHomoP");
1389                 exit(1);
1390         }
1391         
1392 }
1393
1394 //***************************************************************************************************************
1395
1396 bool TrimSeqsCommand::cullAmbigs(Sequence& seq){
1397         try {
1398                 int numNs = seq.getAmbigBases();
1399                 bool success = 0;       //guilty until proven innocent
1400                 
1401                 if(numNs <= maxAmbig)   {       success = 1;    }
1402                 else                                    {       success = 0;    }
1403                 
1404                 return success;
1405         }
1406         catch(exception& e) {
1407                 m->errorOut(e, "TrimSeqsCommand", "cullAmbigs");
1408                 exit(1);
1409         }
1410         
1411 }
1412
1413 //***************************************************************************************************************
1414
1415 bool TrimSeqsCommand::compareDNASeq(string oligo, string seq){
1416         try {
1417                 bool success = 1;
1418                 int length = oligo.length();
1419                 
1420                 for(int i=0;i<length;i++){
1421                         
1422                         if(oligo[i] != seq[i]){
1423                                 if(oligo[i] == 'A' || oligo[i] == 'T' || oligo[i] == 'G' || oligo[i] == 'C')    {       success = 0;    }
1424                                 else if((oligo[i] == 'N' || oligo[i] == 'I') && (seq[i] == 'N'))                                {       success = 0;    }
1425                                 else if(oligo[i] == 'R' && (seq[i] != 'A' && seq[i] != 'G'))                                    {       success = 0;    }
1426                                 else if(oligo[i] == 'Y' && (seq[i] != 'C' && seq[i] != 'T'))                                    {       success = 0;    }
1427                                 else if(oligo[i] == 'M' && (seq[i] != 'C' && seq[i] != 'A'))                                    {       success = 0;    }
1428                                 else if(oligo[i] == 'K' && (seq[i] != 'T' && seq[i] != 'G'))                                    {       success = 0;    }
1429                                 else if(oligo[i] == 'W' && (seq[i] != 'T' && seq[i] != 'A'))                                    {       success = 0;    }
1430                                 else if(oligo[i] == 'S' && (seq[i] != 'C' && seq[i] != 'G'))                                    {       success = 0;    }
1431                                 else if(oligo[i] == 'B' && (seq[i] != 'C' && seq[i] != 'T' && seq[i] != 'G'))   {       success = 0;    }
1432                                 else if(oligo[i] == 'D' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'G'))   {       success = 0;    }
1433                                 else if(oligo[i] == 'H' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'C'))   {       success = 0;    }
1434                                 else if(oligo[i] == 'V' && (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G'))   {       success = 0;    }                       
1435                                 
1436                                 if(success == 0)        {       break;   }
1437                         }
1438                         else{
1439                                 success = 1;
1440                         }
1441                 }
1442                 
1443                 return success;
1444         }
1445         catch(exception& e) {
1446                 m->errorOut(e, "TrimSeqsCommand", "compareDNASeq");
1447                 exit(1);
1448         }
1449
1450 }
1451
1452 //***************************************************************************************************************
1453
1454 int TrimSeqsCommand::countDiffs(string oligo, string seq){
1455         try {
1456
1457                 int length = oligo.length();
1458                 int countDiffs = 0;
1459                 
1460                 for(int i=0;i<length;i++){
1461                                                                 
1462                         if(oligo[i] != seq[i]){
1463                                 if(oligo[i] == 'A' || oligo[i] == 'T' || oligo[i] == 'G' || oligo[i] == 'C' || oligo[i] == '-' || oligo[i] == '.')      {       countDiffs++;   }
1464                                 else if((oligo[i] == 'N' || oligo[i] == 'I') && (seq[i] == 'N'))                                {       countDiffs++;   }
1465                                 else if(oligo[i] == 'R' && (seq[i] != 'A' && seq[i] != 'G'))                                    {       countDiffs++;   }
1466                                 else if(oligo[i] == 'Y' && (seq[i] != 'C' && seq[i] != 'T'))                                    {       countDiffs++;   }
1467                                 else if(oligo[i] == 'M' && (seq[i] != 'C' && seq[i] != 'A'))                                    {       countDiffs++;   }
1468                                 else if(oligo[i] == 'K' && (seq[i] != 'T' && seq[i] != 'G'))                                    {       countDiffs++;   }
1469                                 else if(oligo[i] == 'W' && (seq[i] != 'T' && seq[i] != 'A'))                                    {       countDiffs++;   }
1470                                 else if(oligo[i] == 'S' && (seq[i] != 'C' && seq[i] != 'G'))                                    {       countDiffs++;   }
1471                                 else if(oligo[i] == 'B' && (seq[i] != 'C' && seq[i] != 'T' && seq[i] != 'G'))   {       countDiffs++;   }
1472                                 else if(oligo[i] == 'D' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'G'))   {       countDiffs++;   }
1473                                 else if(oligo[i] == 'H' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'C'))   {       countDiffs++;   }
1474                                 else if(oligo[i] == 'V' && (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G'))   {       countDiffs++;   }       
1475                         }
1476                         
1477                 }
1478                 
1479                 return countDiffs;
1480         }
1481         catch(exception& e) {
1482                 m->errorOut(e, "TrimSeqsCommand", "countDiffs");
1483                 exit(1);
1484         }
1485
1486 }
1487
1488 //***************************************************************************************************************