]> git.donarmstrong.com Git - mothur.git/blob - trimseqscommand.cpp
trim.seqs fix of gobble
[mothur.git] / trimseqscommand.cpp
1 /*
2  *  trimseqscommand.cpp
3  *  Mothur
4  *
5  *  Created by Pat Schloss on 6/6/09.
6  *  Copyright 2009 Patrick D. Schloss. All rights reserved.
7  *
8  */
9
10 #include "trimseqscommand.h"
11 #include "needlemanoverlap.hpp"
12
13 //**********************************************************************************************************************
14 vector<string> TrimSeqsCommand::setParameters(){        
15         try {
16                 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta);
17                 CommandParameter poligos("oligos", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(poligos);
18                 CommandParameter pqfile("qfile", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pqfile);
19                 CommandParameter pflip("flip", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pflip);
20                 CommandParameter pmaxambig("maxambig", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pmaxambig);
21                 CommandParameter pmaxhomop("maxhomop", "Number", "", "0", "", "", "",false,false); parameters.push_back(pmaxhomop);
22                 CommandParameter pminlength("minlength", "Number", "", "0", "", "", "",false,false); parameters.push_back(pminlength);
23                 CommandParameter pmaxlength("maxlength", "Number", "", "0", "", "", "",false,false); parameters.push_back(pmaxlength);
24                 CommandParameter ppdiffs("pdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(ppdiffs);
25                 CommandParameter pbdiffs("bdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(pbdiffs);
26                 CommandParameter ptdiffs("tdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(ptdiffs);
27                 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
28                 CommandParameter pallfiles("allfiles", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pallfiles);
29                 CommandParameter pqtrim("qtrim", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pqtrim);
30                 CommandParameter pqthreshold("qthreshold", "Number", "", "0", "", "", "",false,false); parameters.push_back(pqthreshold);
31                 CommandParameter pqaverage("qaverage", "Number", "", "0", "", "", "",false,false); parameters.push_back(pqaverage);
32                 CommandParameter prollaverage("rollaverage", "Number", "", "0", "", "", "",false,false); parameters.push_back(prollaverage);
33                 CommandParameter pqwindowaverage("qwindowaverage", "Number", "", "0", "", "", "",false,false); parameters.push_back(pqwindowaverage);
34                 CommandParameter pqstepsize("qstepsize", "Number", "", "1", "", "", "",false,false); parameters.push_back(pqstepsize);
35                 CommandParameter pqwindowsize("qwindowsize", "Number", "", "50", "", "", "",false,false); parameters.push_back(pqwindowsize);
36                 CommandParameter pkeepfirst("keepfirst", "Number", "", "0", "", "", "",false,false); parameters.push_back(pkeepfirst);
37                 CommandParameter premovelast("removelast", "Number", "", "0", "", "", "",false,false); parameters.push_back(premovelast);
38                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
39                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
40                         
41                 vector<string> myArray;
42                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
43                 return myArray;
44         }
45         catch(exception& e) {
46                 m->errorOut(e, "TrimSeqsCommand", "setParameters");
47                 exit(1);
48         }
49 }
50 //**********************************************************************************************************************
51 string TrimSeqsCommand::getHelpString(){        
52         try {
53                 string helpString = "";
54                 helpString += "The trim.seqs command reads a fastaFile and creates 2 new fasta files, .trim.fasta and scrap.fasta, as well as group files if you provide and oligos file.\n";
55                 helpString += "The .trim.fasta contains sequences that meet your requirements, and the .scrap.fasta contains those which don't.\n";
56                 helpString += "The trim.seqs command parameters are fasta, flip, oligos, maxambig, maxhomop, minlength, maxlength, qfile, qthreshold, qaverage, diffs, qtrim, keepfirst, removelast and allfiles.\n";
57                 helpString += "The fasta parameter is required.\n";
58                 helpString += "The flip parameter will output the reverse compliment of your trimmed sequence. The default is false.\n";
59                 helpString += "The oligos parameter allows you to provide an oligos file.\n";
60                 helpString += "The maxambig parameter allows you to set the maximum number of ambigious bases allowed. The default is -1.\n";
61                 helpString += "The maxhomop parameter allows you to set a maximum homopolymer length. \n";
62                 helpString += "The minlength parameter allows you to set and minimum sequence length. \n";
63                 helpString += "The maxlength parameter allows you to set and maximum sequence length. \n";
64                 helpString += "The tdiffs parameter is used to specify the total number of differences allowed in the sequence. The default is pdiffs + bdiffs.\n";
65                 helpString += "The bdiffs parameter is used to specify the number of differences allowed in the barcode. The default is 0.\n";
66                 helpString += "The pdiffs parameter is used to specify the number of differences allowed in the primer. The default is 0.\n";
67                 helpString += "The qfile parameter allows you to provide a quality file.\n";
68                 helpString += "The qthreshold parameter allows you to set a minimum quality score allowed. \n";
69                 helpString += "The qaverage parameter allows you to set a minimum average quality score allowed. \n";
70                 helpString += "The qwindowsize parameter allows you to set a number of bases in a window. Default=50.\n";
71                 helpString += "The qwindowaverage parameter allows you to set a minimum average quality score allowed over a window. \n";
72                 helpString += "The rollaverage parameter allows you to set a minimum rolling average quality score allowed over a window. \n";
73                 helpString += "The qstepsize parameter allows you to set a number of bases to move the window over. Default=1.\n";
74                 helpString += "The allfiles parameter will create separate group and fasta file for each grouping. The default is F.\n";
75                 helpString += "The qtrim parameter will trim sequence from the point that they fall below the qthreshold and put it in the .trim file if set to true. The default is T.\n";
76                 helpString += "The keepfirst parameter trims the sequence to the first keepfirst number of bases after the barcode or primers are removed, before the sequence is checked to see if it meets the other requirements. \n";
77                 helpString += "The removelast removes the last removelast number of bases after the barcode or primers are removed, before the sequence is checked to see if it meets the other requirements.\n";
78                 helpString += "The trim.seqs command should be in the following format: \n";
79                 helpString += "trim.seqs(fasta=yourFastaFile, flip=yourFlip, oligos=yourOligos, maxambig=yourMaxambig,  \n";
80                 helpString += "maxhomop=yourMaxhomop, minlength=youMinlength, maxlength=yourMaxlength)  \n";    
81                 helpString += "Example trim.seqs(fasta=abrecovery.fasta, flip=..., oligos=..., maxambig=..., maxhomop=..., minlength=..., maxlength=...).\n";
82                 helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n";
83                 helpString += "For more details please check out the wiki http://www.mothur.org/wiki/Trim.seqs .\n";
84                 return helpString;
85         }
86         catch(exception& e) {
87                 m->errorOut(e, "TrimSeqsCommand", "getHelpString");
88                 exit(1);
89         }
90 }
91
92
93 //**********************************************************************************************************************
94
95 TrimSeqsCommand::TrimSeqsCommand(){     
96         try {
97                 abort = true; calledHelp = true; 
98                 setParameters();
99                 vector<string> tempOutNames;
100                 outputTypes["fasta"] = tempOutNames;
101                 outputTypes["qfile"] = tempOutNames;
102                 outputTypes["group"] = tempOutNames;
103         }
104         catch(exception& e) {
105                 m->errorOut(e, "TrimSeqsCommand", "TrimSeqsCommand");
106                 exit(1);
107         }
108 }
109 //***************************************************************************************************************
110
111 TrimSeqsCommand::TrimSeqsCommand(string option)  {
112         try {
113                 
114                 abort = false; calledHelp = false;   
115                 comboStarts = 0;
116                 
117                 //allow user to run help
118                 if(option == "help") { help(); abort = true; calledHelp = true; }
119                 
120                 else {
121                         vector<string> myArray = setParameters();
122                         
123                         OptionParser parser(option);
124                         map<string,string> parameters = parser.getParameters();
125                         
126                         ValidParameters validParameter;
127                         map<string,string>::iterator it;
128                         
129                         //check to make sure all parameters are valid for command
130                         for (it = parameters.begin(); it != parameters.end(); it++) { 
131                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
132                         }
133                         
134                         //initialize outputTypes
135                         vector<string> tempOutNames;
136                         outputTypes["fasta"] = tempOutNames;
137                         outputTypes["qfile"] = tempOutNames;
138                         outputTypes["group"] = tempOutNames;
139                         
140                         //if the user changes the input directory command factory will send this info to us in the output parameter 
141                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
142                         if (inputDir == "not found"){   inputDir = "";          }
143                         else {
144                                 string path;
145                                 it = parameters.find("fasta");
146                                 //user has given a template file
147                                 if(it != parameters.end()){ 
148                                         path = m->hasPath(it->second);
149                                         //if the user has not given a path then, add inputdir. else leave path alone.
150                                         if (path == "") {       parameters["fasta"] = inputDir + it->second;            }
151                                 }
152                                 
153                                 it = parameters.find("oligos");
154                                 //user has given a template file
155                                 if(it != parameters.end()){ 
156                                         path = m->hasPath(it->second);
157                                         //if the user has not given a path then, add inputdir. else leave path alone.
158                                         if (path == "") {       parameters["oligos"] = inputDir + it->second;           }
159                                 }
160                                 
161                                 it = parameters.find("qfile");
162                                 //user has given a template file
163                                 if(it != parameters.end()){ 
164                                         path = m->hasPath(it->second);
165                                         //if the user has not given a path then, add inputdir. else leave path alone.
166                                         if (path == "") {       parameters["qfile"] = inputDir + it->second;            }
167                                 }
168                                 
169                         }
170
171                         
172                         //check for required parameters
173                         fastaFile = validParameter.validFile(parameters, "fasta", true);
174                         if (fastaFile == "not found") {                                 
175                                 fastaFile = m->getFastaFile(); 
176                                 if (fastaFile != "") { m->mothurOut("Using " + fastaFile + " as input file for the fasta parameter."); m->mothurOutEndLine(); }
177                                 else {  m->mothurOut("You have no current fastafile and the fasta parameter is required."); m->mothurOutEndLine(); abort = true; }
178                         }else if (fastaFile == "not open") { abort = true; }    
179                         
180                         //if the user changes the output directory command factory will send this info to us in the output parameter 
181                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  
182                                 outputDir = ""; 
183                                 outputDir += m->hasPath(fastaFile); //if user entered a file with a path then preserve it       
184                         }
185                 
186                         
187                         //check for optional parameter and set defaults
188                         // ...at some point should added some additional type checking...
189                         string temp;
190                         temp = validParameter.validFile(parameters, "flip", false);
191                         if (temp == "not found"){       flip = 0;       }
192                         else if(m->isTrue(temp))        {       flip = 1;       }
193                 
194                         temp = validParameter.validFile(parameters, "oligos", true);
195                         if (temp == "not found"){       oligoFile = "";         }
196                         else if(temp == "not open"){    abort = true;   } 
197                         else                                    {       oligoFile = temp;               }
198                         
199                         
200                         temp = validParameter.validFile(parameters, "maxambig", false);         if (temp == "not found") { temp = "-1"; }
201                         convert(temp, maxAmbig);  
202
203                         temp = validParameter.validFile(parameters, "maxhomop", false);         if (temp == "not found") { temp = "0"; }
204                         convert(temp, maxHomoP);  
205
206                         temp = validParameter.validFile(parameters, "minlength", false);        if (temp == "not found") { temp = "0"; }
207                         convert(temp, minLength); 
208                         
209                         temp = validParameter.validFile(parameters, "maxlength", false);        if (temp == "not found") { temp = "0"; }
210                         convert(temp, maxLength);
211                         
212                         temp = validParameter.validFile(parameters, "bdiffs", false);           if (temp == "not found") { temp = "0"; }
213                         convert(temp, bdiffs);
214                         
215                         temp = validParameter.validFile(parameters, "pdiffs", false);           if (temp == "not found") { temp = "0"; }
216                         convert(temp, pdiffs);
217                         
218                         temp = validParameter.validFile(parameters, "tdiffs", false);           if (temp == "not found") { int tempTotal = pdiffs + bdiffs;  temp = toString(tempTotal); }
219                         convert(temp, tdiffs);
220                         
221                         if(tdiffs == 0){        tdiffs = bdiffs + pdiffs;       }
222                         
223                         temp = validParameter.validFile(parameters, "qfile", true);     
224                         if (temp == "not found")        {       qFileName = "";         }
225                         else if(temp == "not open")     {       abort = true;           }
226                         else                                            {       qFileName = temp;       }
227                         
228                         temp = validParameter.validFile(parameters, "qthreshold", false);       if (temp == "not found") { temp = "0"; }
229                         convert(temp, qThreshold);
230                         
231                         temp = validParameter.validFile(parameters, "qtrim", false);            if (temp == "not found") { temp = "t"; }
232                         qtrim = m->isTrue(temp);
233
234                         temp = validParameter.validFile(parameters, "rollaverage", false);      if (temp == "not found") { temp = "0"; }
235                         convert(temp, qRollAverage);
236
237                         temp = validParameter.validFile(parameters, "qwindowaverage", false);if (temp == "not found") { temp = "0"; }
238                         convert(temp, qWindowAverage);
239
240                         temp = validParameter.validFile(parameters, "qwindowsize", false);      if (temp == "not found") { temp = "50"; }
241                         convert(temp, qWindowSize);
242
243                         temp = validParameter.validFile(parameters, "qstepsize", false);        if (temp == "not found") { temp = "1"; }
244                         convert(temp, qWindowStep);
245
246                         temp = validParameter.validFile(parameters, "qaverage", false);         if (temp == "not found") { temp = "0"; }
247                         convert(temp, qAverage);
248
249                         temp = validParameter.validFile(parameters, "keepfirst", false);        if (temp == "not found") { temp = "0"; }
250                         convert(temp, keepFirst);
251
252                         temp = validParameter.validFile(parameters, "removelast", false);       if (temp == "not found") { temp = "0"; }
253                         convert(temp, removeLast);
254                         
255                         temp = validParameter.validFile(parameters, "allfiles", false);         if (temp == "not found") { temp = "F"; }
256                         allFiles = m->isTrue(temp);
257                         
258                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found"){       temp = m->getProcessors();      }
259                         m->setProcessors(temp);
260                         convert(temp, processors); 
261                         
262                         
263                         if(allFiles && (oligoFile == "")){
264                                 m->mothurOut("You selected allfiles, but didn't enter an oligos.  Ignoring the allfiles request."); m->mothurOutEndLine();
265                         }
266                         if((qAverage != 0 && qThreshold != 0) && qFileName == ""){
267                                 m->mothurOut("You didn't provide a quality file name, quality criteria will be ignored."); m->mothurOutEndLine();
268                                 qAverage=0;
269                                 qThreshold=0;
270                         }
271                         if(!flip && oligoFile=="" && !maxLength && !minLength && (maxAmbig==-1) && !maxHomoP && qFileName == ""){               
272                                 m->mothurOut("You didn't set any options... quiting command."); m->mothurOutEndLine();
273                                 abort = true;
274                         }
275                 }
276
277         }
278         catch(exception& e) {
279                 m->errorOut(e, "TrimSeqsCommand", "TrimSeqsCommand");
280                 exit(1);
281         }
282 }
283 //***************************************************************************************************************
284
285 int TrimSeqsCommand::execute(){
286         try{
287         
288                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
289                 
290                 numFPrimers = 0;  //this needs to be initialized
291                 numRPrimers = 0;
292                 vector<vector<string> > fastaFileNames;
293                 vector<vector<string> > qualFileNames;
294                 
295                 string trimSeqFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "trim.fasta";
296                 outputNames.push_back(trimSeqFile); outputTypes["fasta"].push_back(trimSeqFile);
297                 
298                 string scrapSeqFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "scrap.fasta";
299                 outputNames.push_back(scrapSeqFile); outputTypes["fasta"].push_back(scrapSeqFile);
300                 
301                 string trimQualFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "trim.qual";
302                 string scrapQualFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "scrap.qual";
303                 if (qFileName != "") {
304                         outputNames.push_back(trimQualFile);
305                         outputNames.push_back(scrapQualFile);
306                         outputTypes["qfile"].push_back(trimQualFile);
307                         outputTypes["qfile"].push_back(scrapQualFile); 
308                 }
309                 
310                 string outputGroupFileName;
311                 if(oligoFile != ""){
312                         outputGroupFileName = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "groups";
313                         outputNames.push_back(outputGroupFileName); outputTypes["group"].push_back(outputGroupFileName);
314                         getOligos(fastaFileNames, qualFileNames);
315                 }
316
317                 vector<unsigned long int> fastaFilePos;
318                 vector<unsigned long int> qFilePos;
319                 
320                 setLines(fastaFile, qFileName, fastaFilePos, qFilePos);
321                 
322                 for (int i = 0; i < (fastaFilePos.size()-1); i++) {
323                         lines.push_back(new linePair(fastaFilePos[i], fastaFilePos[(i+1)]));
324                         if (qFileName != "") {  qLines.push_back(new linePair(qFilePos[i], qFilePos[(i+1)]));  }
325                 }       
326                 if(qFileName == "")     {       qLines = lines; } //files with duds
327                 
328                 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
329                                 if(processors == 1){
330                                         driverCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, outputGroupFileName, fastaFileNames, qualFileNames, lines[0], qLines[0]);
331                                 }else{
332                                         createProcessesCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, outputGroupFileName, fastaFileNames, qualFileNames); 
333                                 }       
334                 #else
335                                 driverCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, outputGroupFileName, fastaFileNames, qualFileNames, lines[0], qLines[0]);
336                 #endif
337                 
338                 if (m->control_pressed) {  return 0; }                  
339                         
340                 if(allFiles){
341                         map<string, string> uniqueFastaNames;// so we don't add the same groupfile multiple times
342                         map<string, string>::iterator it;
343                         set<string> namesToRemove;
344                         for(int i=0;i<fastaFileNames.size();i++){
345                                 for(int j=0;j<fastaFileNames[0].size();j++){
346                                         if (fastaFileNames[i][j] != "") {
347                                                 if(m->isBlank(fastaFileNames[i][j])){
348                                                         remove(fastaFileNames[i][j].c_str());
349                                                         namesToRemove.insert(fastaFileNames[i][j]);
350                                                         
351                                                         if(qFileName != ""){
352                                                                 remove(qualFileNames[i][j].c_str());
353                                                                 namesToRemove.insert(qualFileNames[i][j]);
354                                                         }
355                                                 }else{  
356                                                         it = uniqueFastaNames.find(fastaFileNames[i][j]);
357                                                         if (it == uniqueFastaNames.end()) {     
358                                                                 uniqueFastaNames[fastaFileNames[i][j]] = barcodeNameVector[i];  
359                                                         }       
360                                                 }
361                                         }
362                                 }
363                         }
364                         
365                         //remove names for outputFileNames, just cleans up the output
366                         vector<string> outputNames2;
367                         for(int i = 0; i < outputNames.size(); i++) { if (namesToRemove.count(outputNames[i]) == 0) { outputNames2.push_back(outputNames[i]); } }
368                         outputNames = outputNames2;
369                         
370                         for (it = uniqueFastaNames.begin(); it != uniqueFastaNames.end(); it++) {
371                                 ifstream in;
372                                 m->openInputFile(it->first, in);
373                                 
374                                 ofstream out;
375                                 string thisGroupName = outputDir + m->getRootName(m->getSimpleName(it->first)) + "groups";
376                                 outputNames.push_back(thisGroupName); outputTypes["group"].push_back(thisGroupName);
377                                 m->openOutputFile(thisGroupName, out);
378                                 
379                                 while (!in.eof()){
380                                         if (m->control_pressed) { break; }
381                                         
382                                         Sequence currSeq(in); m->gobble(in);
383                                         out << currSeq.getName() << '\t' << it->second << endl;
384                                 }
385                                 in.close();
386                                 out.close();
387                         }
388                 }
389                 
390                 if (m->control_pressed) {       for (int i = 0; i < outputNames.size(); i++) {  remove(outputNames[i].c_str()); } return 0;     }
391
392                 //output group counts
393                 m->mothurOutEndLine();
394                 int total = 0;
395                 for (map<string, int>::iterator it = groupCounts.begin(); it != groupCounts.end(); it++) {
396                          total += it->second; m->mothurOut("Group " + it->first + " contains " + toString(it->second) + " sequences."); m->mothurOutEndLine(); 
397                 }
398                 if (total != 0) { m->mothurOut("Total of all groups is " + toString(total)); m->mothurOutEndLine(); }
399                 
400                 if (m->control_pressed) {       for (int i = 0; i < outputNames.size(); i++) {  remove(outputNames[i].c_str()); } return 0;     }
401
402                 //set fasta file as new current fastafile
403                 string current = "";
404                 itTypes = outputTypes.find("fasta");
405                 if (itTypes != outputTypes.end()) {
406                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFastaFile(current); }
407                 }
408                 
409                 itTypes = outputTypes.find("qfile");
410                 if (itTypes != outputTypes.end()) {
411                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setQualFile(current); }
412                 }
413                 
414                 itTypes = outputTypes.find("group");
415                 if (itTypes != outputTypes.end()) {
416                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setGroupFile(current); }
417                 }
418
419                 m->mothurOutEndLine();
420                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
421                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
422                 m->mothurOutEndLine();
423                 
424                 return 0;       
425                         
426         }
427         catch(exception& e) {
428                 m->errorOut(e, "TrimSeqsCommand", "execute");
429                 exit(1);
430         }
431 }
432                 
433 /**************************************************************************************/
434
435 int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string trimFileName, string scrapFileName, string trimQFileName, string scrapQFileName, string groupFileName, vector<vector<string> > fastaFileNames, vector<vector<string> > qualFileNames, linePair* line, linePair* qline) {        
436                 
437         try {
438                 
439                 ofstream trimFASTAFile;
440                 m->openOutputFile(trimFileName, trimFASTAFile);
441                 
442                 ofstream scrapFASTAFile;
443                 m->openOutputFile(scrapFileName, scrapFASTAFile);
444                 
445                 ofstream trimQualFile;
446                 ofstream scrapQualFile;
447                 if(qFileName != ""){
448                         m->openOutputFile(trimQFileName, trimQualFile);
449                         m->openOutputFile(scrapQFileName, scrapQualFile);
450                 }
451                 
452                 ofstream outGroupsFile;
453                 if (oligoFile != ""){   m->openOutputFile(groupFileName, outGroupsFile);   }
454                 if(allFiles){
455                         for (int i = 0; i < fastaFileNames.size(); i++) { //clears old file
456                                 for (int j = 0; j < fastaFileNames[i].size(); j++) { //clears old file
457                                         if (fastaFileNames[i][j] != "") {
458                                                 ofstream temp;
459                                                 m->openOutputFile(fastaFileNames[i][j], temp);                  temp.close();
460                                                 if(qFileName != ""){
461                                                         m->openOutputFile(qualFileNames[i][j], temp);                   temp.close();
462                                                 }
463                                         }
464                                 }
465                         }
466                 }
467                 
468                 ifstream inFASTA;
469                 m->openInputFile(filename, inFASTA);
470                 inFASTA.seekg(line->start);
471                 
472                 ifstream qFile;
473                 if(qFileName != "")     {
474                         m->openInputFile(qFileName, qFile);
475                         qFile.seekg(qline->start);  
476                 }
477                 
478                 int count = 0;
479                 bool moreSeqs = 1;
480         
481                 while (moreSeqs) {
482                                 
483                         if (m->control_pressed) { 
484                                 inFASTA.close(); trimFASTAFile.close(); scrapFASTAFile.close();
485                                 if (oligoFile != "") {   outGroupsFile.close();   }
486
487                                 if(qFileName != ""){
488                                         qFile.close();
489                                 }
490                                 for (int i = 0; i < outputNames.size(); i++) {  remove(outputNames[i].c_str()); }
491
492                                 return 0;
493                         }
494                         
495                         int success = 1;
496                         string trashCode = "";
497                         int currentSeqsDiffs = 0;
498
499                         Sequence currSeq(inFASTA); m->gobble(inFASTA);
500
501                         QualityScores currQual;
502                         if(qFileName != ""){
503                                 currQual = QualityScores(qFile);  m->gobble(qFile);
504                         }
505
506                         string origSeq = currSeq.getUnaligned();
507                         if (origSeq != "") {
508                                 
509                                 int barcodeIndex = 0;
510                                 int primerIndex = 0;
511                                 
512                                 if(barcodes.size() != 0){
513                                         success = stripBarcode(currSeq, currQual, barcodeIndex);
514                                         if(success > bdiffs)            {       trashCode += 'b';       }
515                                         else{ currentSeqsDiffs += success;  }
516                                 }
517                                 
518                                 if(numFPrimers != 0){
519                                         success = stripForward(currSeq, currQual, primerIndex);
520                                         if(success > pdiffs)            {       trashCode += 'f';       }
521                                         else{ currentSeqsDiffs += success;  }
522                                 }
523                                 
524                                 if (currentSeqsDiffs > tdiffs)  {       trashCode += 't';   }
525                                 
526                                 if(numRPrimers != 0){
527                                         success = stripReverse(currSeq, currQual);
528                                         if(!success)                            {       trashCode += 'r';       }
529                                 }
530
531                                 if(keepFirst != 0){
532                                         success = keepFirstTrim(currSeq, currQual);
533                                 }
534                                 
535                                 if(removeLast != 0){
536                                         success = removeLastTrim(currSeq, currQual);
537                                         if(!success)                            {       trashCode += 'l';       }
538                                 }
539
540                                 
541                                 if(qFileName != ""){
542                                         int origLength = currSeq.getNumBases();
543                                         
544                                         if(qThreshold != 0)                     {       success = currQual.stripQualThreshold(currSeq, qThreshold);                     }
545                                         else if(qAverage != 0)          {       success = currQual.cullQualAverage(currSeq, qAverage);                          }
546                                         else if(qRollAverage != 0)      {       success = currQual.stripQualRollingAverage(currSeq, qRollAverage);      }
547                                         else if(qWindowAverage != 0){   success = currQual.stripQualWindowAverage(currSeq, qWindowStep, qWindowSize, qWindowAverage);   }
548                                         else                                            {       success = 1;                            }
549                                         
550                                         //you don't want to trim, if it fails above then scrap it
551                                         if ((!qtrim) && (origLength != currSeq.getNumBases())) {  success = 0; }
552                                         
553                                         if(!success)                            {       trashCode += 'q';       }
554                                 }                               
555                 
556                                 if(minLength > 0 || maxLength > 0){
557                                         success = cullLength(currSeq);
558                                         if(!success)                            {       trashCode += 'l';       }
559                                 }
560                                 if(maxHomoP > 0){
561                                         success = cullHomoP(currSeq);
562                                         if(!success)                            {       trashCode += 'h';       }
563                                 }
564                                 if(maxAmbig != -1){
565                                         success = cullAmbigs(currSeq);
566                                         if(!success)                            {       trashCode += 'n';       }
567                                 }
568                                 
569                                 if(flip){               // should go last                       
570                                         currSeq.reverseComplement();
571                                         if(qFileName != ""){
572                                                 currQual.flipQScores(); 
573                                         }
574                                 }
575                                 
576                                 if(trashCode.length() == 0){
577                                         currSeq.setAligned(currSeq.getUnaligned());
578                                         currSeq.printSequence(trimFASTAFile);
579                                         
580                                         if(qFileName != ""){
581                                                 currQual.printQScores(trimQualFile);
582                                         }
583                                         
584                                         if(barcodes.size() != 0){
585                                                 string thisGroup = barcodeNameVector[barcodeIndex];
586                                                 if (primers.size() != 0) { thisGroup += "." + primerNameVector[primerIndex]; }
587                                                 
588                                                 outGroupsFile << currSeq.getName() << '\t' << thisGroup << endl;
589                                                 
590                                                 map<string, int>::iterator it = groupCounts.find(thisGroup);
591                                                 if (it == groupCounts.end()) {  groupCounts[thisGroup] = 1; }
592                                                 else { groupCounts[it->first]++; }
593                                                         
594                                         }
595                                         
596                                         
597                                         if(allFiles){
598                                                 ofstream output;
599                                                 m->openOutputFileAppend(fastaFileNames[barcodeIndex][primerIndex], output);
600                                                 currSeq.printSequence(output);
601                                                 output.close();
602                                                 
603                                                 if(qFileName != ""){
604                                                         m->openOutputFileAppend(qualFileNames[barcodeIndex][primerIndex], output);
605                                                         currQual.printQScores(output);
606                                                         output.close();                                                 
607                                                 }
608                                         }
609                                 }
610                                 else{
611                                         currSeq.setName(currSeq.getName() + '|' + trashCode);
612                                         currSeq.setUnaligned(origSeq);
613                                         currSeq.setAligned(origSeq);
614                                         currSeq.printSequence(scrapFASTAFile);
615                                         if(qFileName != ""){
616                                                 currQual.printQScores(scrapQualFile);
617                                         }
618                                 }
619                                 count++;
620                         }
621                         
622                         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
623                                 unsigned long int pos = inFASTA.tellg();
624                                 if ((pos == -1) || (pos >= line->end)) { break; }
625                         #else
626                                 if (inFASTA.eof()) { break; }
627                         #endif
628                                 
629                         //report progress
630                         if((count) % 1000 == 0){        m->mothurOut(toString(count)); m->mothurOutEndLine();           }
631                         
632                 }
633                 //report progress
634                 if((count) % 1000 != 0){        m->mothurOut(toString(count)); m->mothurOutEndLine();           }
635
636                 
637                 inFASTA.close();
638                 trimFASTAFile.close();
639                 scrapFASTAFile.close();
640                 if (oligoFile != "") {   outGroupsFile.close();   }
641                 if(qFileName != "")     {       qFile.close();  scrapQualFile.close(); trimQualFile.close();    }
642                 
643                 return count;
644         }
645         catch(exception& e) {
646                 m->errorOut(e, "TrimSeqsCommand", "driverCreateTrim");
647                 exit(1);
648         }
649 }
650
651 /**************************************************************************************************/
652
653 int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName, string trimFASTAFileName, string scrapFASTAFileName, string trimQualFileName, string scrapQualFileName, string groupFile, vector<vector<string> > fastaFileNames, vector<vector<string> > qualFileNames) {
654         try {
655 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
656                 int process = 1;
657                 int exitCommand = 1;
658                 processIDS.clear();
659                 
660                 //loop through and create all the processes you want
661                 while (process != processors) {
662                         int pid = fork();
663                         
664                         if (pid > 0) {
665                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
666                                 process++;
667                         }else if (pid == 0){
668                                 
669                                 vector<vector<string> > tempFASTAFileNames = fastaFileNames;
670                                 vector<vector<string> > tempPrimerQualFileNames = qualFileNames;
671
672                                 if(allFiles){
673                                         ofstream temp;
674
675                                         for(int i=0;i<tempFASTAFileNames.size();i++){
676                                                 for(int j=0;j<tempFASTAFileNames[i].size();j++){
677                                                         if (tempFASTAFileNames[i][j] != "") {
678                                                                 tempFASTAFileNames[i][j] += toString(getpid()) + ".temp";
679                                                                 m->openOutputFile(tempFASTAFileNames[i][j], temp);                      temp.close();
680
681                                                                 if(qFileName != ""){
682                                                                         tempPrimerQualFileNames[i][j] += toString(getpid()) + ".temp";
683                                                                         m->openOutputFile(tempPrimerQualFileNames[i][j], temp);         temp.close();
684                                                                 }
685                                                         }
686                                                 }
687                                         }
688                                 }
689                                                         
690                                 driverCreateTrim(filename,
691                                                                  qFileName,
692                                                                  (trimFASTAFileName + toString(getpid()) + ".temp"),
693                                                                  (scrapFASTAFileName + toString(getpid()) + ".temp"),
694                                                                  (trimQualFileName + toString(getpid()) + ".temp"),
695                                                                  (scrapQualFileName + toString(getpid()) + ".temp"),
696                                                                  (groupFile + toString(getpid()) + ".temp"),
697                                                                  tempFASTAFileNames,
698                                                                  tempPrimerQualFileNames,
699                                                                  lines[process],
700                                                                  qLines[process]);
701                                 
702                                 //pass groupCounts to parent
703                                 ofstream out;
704                                 string tempFile = filename + toString(getpid()) + ".num.temp";
705                                 m->openOutputFile(tempFile, out);
706                                 for (map<string, int>::iterator it = groupCounts.begin(); it != groupCounts.end(); it++) {
707                                         out << it->first << '\t' << it->second << endl;
708                                 }
709                                 out.close();
710                                 
711                                 exit(0);
712                         }else { 
713                                 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine(); 
714                                 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
715                                 exit(0);
716                         }
717                 }
718                 
719                 //parent do my part
720                 ofstream temp;
721                 m->openOutputFile(trimFASTAFileName, temp);             temp.close();
722                 m->openOutputFile(scrapFASTAFileName, temp);    temp.close();
723                 m->openOutputFile(trimQualFileName, temp);              temp.close();
724                 m->openOutputFile(scrapQualFileName, temp);             temp.close();
725
726                 driverCreateTrim(filename, qFileName, trimFASTAFileName, scrapFASTAFileName, trimQualFileName, scrapQualFileName, groupFile, fastaFileNames, qualFileNames, lines[0], qLines[0]);
727                 
728                 //force parent to wait until all the processes are done
729                 for (int i=0;i<processIDS.size();i++) { 
730                         int temp = processIDS[i];
731                         wait(&temp);
732                 }
733                 
734                 //append files
735                 for(int i=0;i<processIDS.size();i++){
736                         
737                         m->mothurOut("Appending files from process " + toString(processIDS[i])); m->mothurOutEndLine();
738                         
739                         m->appendFiles((trimFASTAFileName + toString(processIDS[i]) + ".temp"), trimFASTAFileName);
740                         remove((trimFASTAFileName + toString(processIDS[i]) + ".temp").c_str());
741                         m->appendFiles((scrapFASTAFileName + toString(processIDS[i]) + ".temp"), scrapFASTAFileName);
742                         remove((scrapFASTAFileName + toString(processIDS[i]) + ".temp").c_str());
743                         
744                         if(qFileName != ""){
745                                 m->appendFiles((trimQualFileName + toString(processIDS[i]) + ".temp"), trimQualFileName);
746                                 remove((trimQualFileName + toString(processIDS[i]) + ".temp").c_str());
747                                 m->appendFiles((scrapQualFileName + toString(processIDS[i]) + ".temp"), scrapQualFileName);
748                                 remove((scrapQualFileName + toString(processIDS[i]) + ".temp").c_str());
749                         }
750                         
751                         m->appendFiles((groupFile + toString(processIDS[i]) + ".temp"), groupFile);
752                         remove((groupFile + toString(processIDS[i]) + ".temp").c_str());
753                         
754                         
755                         if(allFiles){
756                                 for(int j=0;j<fastaFileNames.size();j++){
757                                         for(int k=0;k<fastaFileNames[j].size();k++){
758                                                 if (fastaFileNames[j][k] != "") {
759                                                         m->appendFiles((fastaFileNames[j][k] + toString(processIDS[i]) + ".temp"), fastaFileNames[j][k]);
760                                                         remove((fastaFileNames[j][k] + toString(processIDS[i]) + ".temp").c_str());
761                                                         
762                                                         if(qFileName != ""){
763                                                                 m->appendFiles((qualFileNames[j][k] + toString(processIDS[i]) + ".temp"), qualFileNames[j][k]);
764                                                                 remove((qualFileNames[j][k] + toString(processIDS[i]) + ".temp").c_str());
765                                                         }
766                                                 }
767                                         }
768                                 }
769                         }
770                         
771                         ifstream in;
772                         string tempFile =  filename + toString(processIDS[i]) + ".num.temp";
773                         m->openInputFile(tempFile, in);
774                         int tempNum;
775                         string group;
776                         while (!in.eof()) { 
777                                 in >> group >> tempNum; m->gobble(in);
778                                 
779                                 map<string, int>::iterator it = groupCounts.find(group);
780                                 if (it == groupCounts.end()) {  groupCounts[group] = tempNum; }
781                                 else { groupCounts[it->first] += tempNum; }
782                         }
783                         in.close(); remove(tempFile.c_str());
784                         
785                 }
786         
787                 return exitCommand;
788 #endif          
789         }
790         catch(exception& e) {
791                 m->errorOut(e, "TrimSeqsCommand", "createProcessesCreateTrim");
792                 exit(1);
793         }
794 }
795
796 /**************************************************************************************************/
797
798 int TrimSeqsCommand::setLines(string filename, string qfilename, vector<unsigned long int>& fastaFilePos, vector<unsigned long int>& qfileFilePos) {
799         try {
800                 
801                 //set file positions for fasta file
802                 fastaFilePos = m->divideFile(filename, processors);
803                 
804                 if (qfilename == "") { return processors; }
805                 
806                 //get name of first sequence in each chunk
807                 map<string, int> firstSeqNames;
808                 for (int i = 0; i < (fastaFilePos.size()-1); i++) {
809                         ifstream in;
810                         m->openInputFile(filename, in);
811                         in.seekg(fastaFilePos[i]);
812                 
813                         Sequence temp(in); 
814                         firstSeqNames[temp.getName()] = i;
815                 
816                         in.close();
817                 }
818                                 
819                 //seach for filePos of each first name in the qfile and save in qfileFilePos
820                 ifstream inQual;
821                 m->openInputFile(qfilename, inQual);
822                 
823                 string input;
824                 while(!inQual.eof()){   
825                         input = m->getline(inQual);
826
827                         if (input.length() != 0) {
828                                 if(input[0] == '>'){ //this is a sequence name line
829                                         istringstream nameStream(input);
830                                         
831                                         string sname = "";  nameStream >> sname;
832                                         sname = sname.substr(1);
833                                         
834                                         map<string, int>::iterator it = firstSeqNames.find(sname);
835                                         
836                                         if(it != firstSeqNames.end()) { //this is the start of a new chunk
837                                                 unsigned long int pos = inQual.tellg(); 
838                                                 qfileFilePos.push_back(pos - input.length() - 1);       
839                                                 firstSeqNames.erase(it);
840                                         }
841                                 }
842                         }
843                         
844                         if (firstSeqNames.size() == 0) { break; }
845                 }
846                 inQual.close();
847                 
848                 
849                 if (firstSeqNames.size() != 0) { 
850                         for (map<string, int>::iterator it = firstSeqNames.begin(); it != firstSeqNames.end(); it++) {
851                                 m->mothurOut(it->first + " is in your fasta file and not in your quality file, not using quality file."); m->mothurOutEndLine();
852                         }
853                         qFileName = "";
854                         return processors;
855                 }
856
857                 //get last file position of qfile
858                 FILE * pFile;
859                 unsigned long int size;
860                 
861                 //get num bytes in file
862                 pFile = fopen (qfilename.c_str(),"rb");
863                 if (pFile==NULL) perror ("Error opening file");
864                 else{
865                         fseek (pFile, 0, SEEK_END);
866                         size=ftell (pFile);
867                         fclose (pFile);
868                 }
869                 
870                 qfileFilePos.push_back(size);
871                 
872                 return processors;
873         }
874         catch(exception& e) {
875                 m->errorOut(e, "TrimSeqsCommand", "setLines");
876                 exit(1);
877         }
878 }
879
880 //***************************************************************************************************************
881
882 void TrimSeqsCommand::getOligos(vector<vector<string> >& fastaFileNames, vector<vector<string> >& qualFileNames){
883         try {
884                 ifstream inOligos;
885                 m->openInputFile(oligoFile, inOligos);
886                 
887                 ofstream test;
888                 
889                 string type, oligo, group;
890
891                 int indexPrimer = 0;
892                 int indexBarcode = 0;
893                 
894                 while(!inOligos.eof()){
895
896                         inOligos >> type; 
897                                         
898                         if(type[0] == '#'){
899                                 while (!inOligos.eof()) {       char c = inOligos.get();  if (c == 10 || c == 13){      break;  }       } // get rest of line if there's any crap there
900                                 m->gobble(inOligos);
901                         }
902                         else{
903                                 m->gobble(inOligos);
904                                 //make type case insensitive
905                                 for(int i=0;i<type.length();i++){       type[i] = toupper(type[i]);  }
906                                 
907                                 inOligos >> oligo;
908                                 
909                                 for(int i=0;i<oligo.length();i++){
910                                         oligo[i] = toupper(oligo[i]);
911                                         if(oligo[i] == 'U')     {       oligo[i] = 'T'; }
912                                 }
913                                 
914                                 if(type == "FORWARD"){
915                                         group = "";
916                                         
917                                         // get rest of line in case there is a primer name
918                                         while (!inOligos.eof()) {       
919                                                 char c = inOligos.get(); 
920                                                 if (c == 10 || c == 13){        break;  }
921                                                 else if (c == 32 || c == 9){;} //space or tab
922                                                 else {  group += c;  }
923                                         } 
924                                         
925                                         //check for repeat barcodes
926                                         map<string, int>::iterator itPrime = primers.find(oligo);
927                                         if (itPrime != primers.end()) { m->mothurOut("primer " + oligo + " is in your oligos file already."); m->mothurOutEndLine();  }
928                                         
929                                         primers[oligo]=indexPrimer; indexPrimer++;              
930                                         primerNameVector.push_back(group);
931                                 }
932                                 else if(type == "REVERSE"){
933                                         Sequence oligoRC("reverse", oligo);
934                                         oligoRC.reverseComplement();
935                                         revPrimer.push_back(oligoRC.getUnaligned());
936                                 }
937                                 else if(type == "BARCODE"){
938                                         inOligos >> group;
939                                         
940                                         //check for repeat barcodes
941                                         map<string, int>::iterator itBar = barcodes.find(oligo);
942                                         if (itBar != barcodes.end()) { m->mothurOut("barcode " + oligo + " is in your oligos file already."); m->mothurOutEndLine();  }
943                                                 
944                                         barcodes[oligo]=indexBarcode; indexBarcode++;
945                                         barcodeNameVector.push_back(group);
946                                 }
947                                 else{   m->mothurOut(type + " is not recognized as a valid type. Choices are forward, reverse, and barcode. Ignoring " + oligo + "."); m->mothurOutEndLine();  }
948                         }
949                         m->gobble(inOligos);
950                 }       
951                 inOligos.close();
952                 
953                 if(barcodeNameVector.size() == 0 && primerNameVector[0] == ""){ allFiles = 0;   }
954                 
955                 //add in potential combos
956                 if(barcodeNameVector.size() == 0){
957                         barcodes[""] = 0;
958                         barcodeNameVector.push_back("");                        
959                 }
960                 
961                 if(primerNameVector.size() == 0){
962                         primers[""] = 0;
963                         primerNameVector.push_back("");                 
964                 }
965                 
966                 fastaFileNames.resize(barcodeNameVector.size());
967                 for(int i=0;i<fastaFileNames.size();i++){
968                         fastaFileNames[i].assign(primerNameVector.size(), "");
969                 }
970                 if(qFileName != ""){    qualFileNames = fastaFileNames; }
971                 
972                 if(allFiles){
973                         set<string> uniqueNames; //used to cleanup outputFileNames
974                         for(map<string, int>::iterator itBar = barcodes.begin();itBar != barcodes.end();itBar++){
975                                 for(map<string, int>::iterator itPrimer = primers.begin();itPrimer != primers.end(); itPrimer++){
976                                         
977                                         string primerName = primerNameVector[itPrimer->second];
978                                         string barcodeName = barcodeNameVector[itBar->second];
979                                         
980                                         string comboGroupName = "";
981                                         string fastaFileName = "";
982                                         string qualFileName = "";
983                                         
984                                         if(primerName == ""){
985                                                 comboGroupName = barcodeNameVector[itBar->second];
986                                         }
987                                         else{
988                                                 if(barcodeName == ""){
989                                                         comboGroupName = primerNameVector[itPrimer->second];
990                                                 }
991                                                 else{
992                                                         comboGroupName = barcodeNameVector[itBar->second] + "." + primerNameVector[itPrimer->second];
993                                                 }
994                                         }
995
996                                         ofstream temp;
997                                         fastaFileName = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + comboGroupName + ".fasta";
998                                         if (uniqueNames.count(fastaFileName) == 0) {
999                                                 outputNames.push_back(fastaFileName);
1000                                                 outputTypes["fasta"].push_back(fastaFileName);
1001                                                 uniqueNames.insert(fastaFileName);
1002                                         }
1003                                         
1004                                         fastaFileNames[itBar->second][itPrimer->second] = fastaFileName;
1005                                         m->openOutputFile(fastaFileName, temp);         temp.close();
1006
1007                                         if(qFileName != ""){
1008                                                 qualFileName = outputDir + m->getRootName(m->getSimpleName(qFileName)) + comboGroupName + ".qual";
1009                                                 if (uniqueNames.count(fastaFileName) == 0) {
1010                                                         outputNames.push_back(qualFileName);
1011                                                         outputTypes["qfile"].push_back(qualFileName);
1012                                                 }
1013                                                 
1014                                                 qualFileNames[itBar->second][itPrimer->second] = qualFileName;
1015                                                 m->openOutputFile(qualFileName, temp);          temp.close();
1016                                         }
1017                                 }
1018                         }
1019                 }
1020                 numFPrimers = primers.size();
1021                 numRPrimers = revPrimer.size();
1022
1023         }
1024         catch(exception& e) {
1025                 m->errorOut(e, "TrimSeqsCommand", "getOligos");
1026                 exit(1);
1027         }
1028 }
1029
1030 //***************************************************************************************************************
1031
1032 int TrimSeqsCommand::stripBarcode(Sequence& seq, QualityScores& qual, int& group){
1033         try {
1034                 
1035                 string rawSequence = seq.getUnaligned();
1036                 int success = bdiffs + 1;       //guilty until proven innocent
1037                 
1038                 //can you find the barcode
1039                 for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
1040                         string oligo = it->first;
1041                         if(rawSequence.length() < oligo.length()){      //let's just assume that the barcodes are the same length
1042                                 success = bdiffs + 10;                                  //if the sequence is shorter than the barcode then bail out
1043                                 break;  
1044                         }
1045                         
1046                         if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
1047                                 group = it->second;
1048                                 seq.setUnaligned(rawSequence.substr(oligo.length()));
1049                                 
1050                                 if(qual.getName() != ""){
1051                                         qual.trimQScores(oligo.length(), -1);
1052                                 }
1053                                 
1054                                 success = 0;
1055                                 break;
1056                         }
1057                 }
1058                 
1059                 //if you found the barcode or if you don't want to allow for diffs
1060                 if ((bdiffs == 0) || (success == 0)) { return success;  }
1061                 
1062                 else { //try aligning and see if you can find it
1063
1064                         int maxLength = 0;
1065
1066                         Alignment* alignment;
1067                         if (barcodes.size() > 0) {
1068                                 map<string,int>::iterator it=barcodes.begin();
1069
1070                                 for(it;it!=barcodes.end();it++){
1071                                         if(it->first.length() > maxLength){
1072                                                 maxLength = it->first.length();
1073                                         }
1074                                 }
1075                                 alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+bdiffs+1));  
1076
1077                         }else{ alignment = NULL; } 
1078                         
1079                         //can you find the barcode
1080                         int minDiff = 1e6;
1081                         int minCount = 1;
1082                         int minGroup = -1;
1083                         int minPos = 0;
1084                         
1085                         for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
1086                                 string oligo = it->first;
1087 //                              int length = oligo.length();
1088                                 
1089                                 if(rawSequence.length() < maxLength){   //let's just assume that the barcodes are the same length
1090                                         success = bdiffs + 10;
1091                                         break;
1092                                 }
1093                                 
1094                                 //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
1095                                 alignment->align(oligo, rawSequence.substr(0,oligo.length()+bdiffs));
1096                                 oligo = alignment->getSeqAAln();
1097                                 string temp = alignment->getSeqBAln();
1098                 
1099                                 int alnLength = oligo.length();
1100                                 
1101                                 for(int i=oligo.length()-1;i>=0;i--){
1102                                         if(oligo[i] != '-'){    alnLength = i+1;        break;  }
1103                                 }
1104                                 oligo = oligo.substr(0,alnLength);
1105                                 temp = temp.substr(0,alnLength);
1106                                 
1107                                 int numDiff = countDiffs(oligo, temp);
1108                                 
1109                                 if(numDiff < minDiff){
1110                                         minDiff = numDiff;
1111                                         minCount = 1;
1112                                         minGroup = it->second;
1113                                         minPos = 0;
1114                                         for(int i=0;i<alnLength;i++){
1115                                                 if(temp[i] != '-'){
1116                                                         minPos++;
1117                                                 }
1118                                         }
1119                                 }
1120                                 else if(numDiff == minDiff){
1121                                         minCount++;
1122                                 }
1123
1124                         }
1125
1126                         if(minDiff > bdiffs)    {       success = minDiff;              }       //no good matches
1127                         else if(minCount > 1)   {       success = bdiffs + 100; }       //can't tell the difference between multiple barcodes
1128                         else{                                                                                                   //use the best match
1129                                 group = minGroup;
1130                                 seq.setUnaligned(rawSequence.substr(minPos));
1131                                 
1132                                 if(qual.getName() != ""){
1133                                         qual.trimQScores(minPos, -1);
1134                                 }
1135                                 success = minDiff;
1136                         }
1137                         
1138                         if (alignment != NULL) {  delete alignment;  }
1139                         
1140                 }
1141                 
1142                 return success;
1143                 
1144         }
1145         catch(exception& e) {
1146                 m->errorOut(e, "TrimSeqsCommand", "stripBarcode");
1147                 exit(1);
1148         }
1149
1150 }
1151
1152 //***************************************************************************************************************
1153
1154 int TrimSeqsCommand::stripForward(Sequence& seq, QualityScores& qual, int& group){
1155         try {
1156                 string rawSequence = seq.getUnaligned();
1157                 int success = pdiffs + 1;       //guilty until proven innocent
1158                 
1159                 //can you find the primer
1160                 for(map<string,int>::iterator it=primers.begin();it!=primers.end();it++){
1161                         string oligo = it->first;
1162                         if(rawSequence.length() < oligo.length()){      //let's just assume that the primers are the same length
1163                                 success = pdiffs + 10;                                  //if the sequence is shorter than the barcode then bail out
1164                                 break;  
1165                         }
1166                         
1167                         if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
1168                                 group = it->second;
1169                                 seq.setUnaligned(rawSequence.substr(oligo.length()));
1170                                 if(qual.getName() != ""){
1171                                         qual.trimQScores(oligo.length(), -1);
1172                                 }
1173                                 success = 0;
1174                                 break;
1175                         }
1176                 }
1177
1178                 //if you found the barcode or if you don't want to allow for diffs
1179                 if ((pdiffs == 0) || (success == 0)) { return success;  }
1180                 
1181                 else { //try aligning and see if you can find it
1182
1183                         int maxLength = 0;
1184
1185                         Alignment* alignment;
1186                         if (primers.size() > 0) {
1187                                 map<string,int>::iterator it=primers.begin();
1188
1189                                 for(it;it!=primers.end();it++){
1190                                         if(it->first.length() > maxLength){
1191                                                 maxLength = it->first.length();
1192                                         }
1193                                 }
1194                                 alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+pdiffs+1));  
1195
1196                         }else{ alignment = NULL; } 
1197                         
1198                         //can you find the barcode
1199                         int minDiff = 1e6;
1200                         int minCount = 1;
1201                         int minGroup = -1;
1202                         int minPos = 0;
1203                         
1204                         for(map<string,int>::iterator it=primers.begin();it!=primers.end();it++){
1205                                 string oligo = it->first;
1206 //                              int length = oligo.length();
1207                                 
1208                                 if(rawSequence.length() < maxLength){   
1209                                         success = pdiffs + 100;
1210                                         break;
1211                                 }
1212                                 
1213                                 //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
1214                                 alignment->align(oligo, rawSequence.substr(0,oligo.length()+pdiffs));
1215                                 oligo = alignment->getSeqAAln();
1216                                 string temp = alignment->getSeqBAln();
1217                 
1218                                 int alnLength = oligo.length();
1219                                 
1220                                 for(int i=oligo.length()-1;i>=0;i--){
1221                                         if(oligo[i] != '-'){    alnLength = i+1;        break;  }
1222                                 }
1223                                 oligo = oligo.substr(0,alnLength);
1224                                 temp = temp.substr(0,alnLength);
1225                                 
1226                                 int numDiff = countDiffs(oligo, temp);
1227                                 
1228                                 if(numDiff < minDiff){
1229                                         minDiff = numDiff;
1230                                         minCount = 1;
1231                                         minGroup = it->second;
1232                                         minPos = 0;
1233                                         for(int i=0;i<alnLength;i++){
1234                                                 if(temp[i] != '-'){
1235                                                         minPos++;
1236                                                 }
1237                                         }
1238                                 }
1239                                 else if(numDiff == minDiff){
1240                                         minCount++;
1241                                 }
1242
1243                         }
1244
1245                         if(minDiff > pdiffs)    {       success = minDiff;              }       //no good matches
1246                         else if(minCount > 1)   {       success = pdiffs + 10;  }       //can't tell the difference between multiple primers
1247                         else{                                                                                                   //use the best match
1248                                 group = minGroup;
1249                                 seq.setUnaligned(rawSequence.substr(minPos));
1250                                 if(qual.getName() != ""){
1251                                         qual.trimQScores(minPos, -1);
1252                                 }
1253                                 success = minDiff;
1254                         }
1255                         
1256                         if (alignment != NULL) {  delete alignment;  }
1257                         
1258                 }
1259                 
1260                 return success;
1261
1262         }
1263         catch(exception& e) {
1264                 m->errorOut(e, "TrimSeqsCommand", "stripForward");
1265                 exit(1);
1266         }
1267 }
1268
1269 //***************************************************************************************************************
1270
1271 bool TrimSeqsCommand::stripReverse(Sequence& seq, QualityScores& qual){
1272         try {
1273                 string rawSequence = seq.getUnaligned();
1274                 bool success = 0;       //guilty until proven innocent
1275                 
1276                 for(int i=0;i<numRPrimers;i++){
1277                         string oligo = revPrimer[i];
1278                         
1279                         if(rawSequence.length() < oligo.length()){
1280                                 success = 0;
1281                                 break;
1282                         }
1283                         
1284                         if(compareDNASeq(oligo, rawSequence.substr(rawSequence.length()-oligo.length(),oligo.length()))){
1285                                 seq.setUnaligned(rawSequence.substr(0,rawSequence.length()-oligo.length()));
1286                                 if(qual.getName() != ""){
1287                                         qual.trimQScores(-1, rawSequence.length()-oligo.length());
1288                                 }
1289                                 success = 1;
1290                                 break;
1291                         }
1292                 }       
1293                 return success;
1294                 
1295         }
1296         catch(exception& e) {
1297                 m->errorOut(e, "TrimSeqsCommand", "stripReverse");
1298                 exit(1);
1299         }
1300 }
1301
1302 //***************************************************************************************************************
1303
1304 bool TrimSeqsCommand::keepFirstTrim(Sequence& sequence, QualityScores& qscores){
1305         try {
1306                 bool success = 1;
1307                 if(qscores.getName() != ""){
1308                         qscores.trimQScores(-1, keepFirst);
1309                 }
1310                 sequence.trim(keepFirst);
1311                 return success;
1312         }
1313         catch(exception& e) {
1314                 m->errorOut(e, "keepFirstTrim", "countDiffs");
1315                 exit(1);
1316         }
1317         
1318 }       
1319
1320 //***************************************************************************************************************
1321
1322 bool TrimSeqsCommand::removeLastTrim(Sequence& sequence, QualityScores& qscores){
1323         try {
1324                 bool success = 0;
1325                 
1326                 int length = sequence.getNumBases() - removeLast;
1327                 
1328                 if(length > 0){
1329                         if(qscores.getName() != ""){
1330                                 qscores.trimQScores(-1, length);
1331                         }
1332                         sequence.trim(length);
1333                         success = 1;
1334                 }
1335                 else{
1336                         success = 0;
1337                 }
1338
1339                 return success;
1340         }
1341         catch(exception& e) {
1342                 m->errorOut(e, "removeLastTrim", "countDiffs");
1343                 exit(1);
1344         }
1345         
1346 }       
1347
1348 //***************************************************************************************************************
1349
1350 bool TrimSeqsCommand::cullLength(Sequence& seq){
1351         try {
1352         
1353                 int length = seq.getNumBases();
1354                 bool success = 0;       //guilty until proven innocent
1355                 
1356                 if(length >= minLength && maxLength == 0)                       {       success = 1;    }
1357                 else if(length >= minLength && length <= maxLength)     {       success = 1;    }
1358                 else                                                                                            {       success = 0;    }
1359                 
1360                 return success;
1361         
1362         }
1363         catch(exception& e) {
1364                 m->errorOut(e, "TrimSeqsCommand", "cullLength");
1365                 exit(1);
1366         }
1367         
1368 }
1369
1370 //***************************************************************************************************************
1371
1372 bool TrimSeqsCommand::cullHomoP(Sequence& seq){
1373         try {
1374                 int longHomoP = seq.getLongHomoPolymer();
1375                 bool success = 0;       //guilty until proven innocent
1376                 
1377                 if(longHomoP <= maxHomoP){      success = 1;    }
1378                 else                                    {       success = 0;    }
1379                 
1380                 return success;
1381         }
1382         catch(exception& e) {
1383                 m->errorOut(e, "TrimSeqsCommand", "cullHomoP");
1384                 exit(1);
1385         }
1386         
1387 }
1388
1389 //***************************************************************************************************************
1390
1391 bool TrimSeqsCommand::cullAmbigs(Sequence& seq){
1392         try {
1393                 int numNs = seq.getAmbigBases();
1394                 bool success = 0;       //guilty until proven innocent
1395                 
1396                 if(numNs <= maxAmbig)   {       success = 1;    }
1397                 else                                    {       success = 0;    }
1398                 
1399                 return success;
1400         }
1401         catch(exception& e) {
1402                 m->errorOut(e, "TrimSeqsCommand", "cullAmbigs");
1403                 exit(1);
1404         }
1405         
1406 }
1407
1408 //***************************************************************************************************************
1409
1410 bool TrimSeqsCommand::compareDNASeq(string oligo, string seq){
1411         try {
1412                 bool success = 1;
1413                 int length = oligo.length();
1414                 
1415                 for(int i=0;i<length;i++){
1416                         
1417                         if(oligo[i] != seq[i]){
1418                                 if(oligo[i] == 'A' || oligo[i] == 'T' || oligo[i] == 'G' || oligo[i] == 'C')    {       success = 0;    }
1419                                 else if((oligo[i] == 'N' || oligo[i] == 'I') && (seq[i] == 'N'))                                {       success = 0;    }
1420                                 else if(oligo[i] == 'R' && (seq[i] != 'A' && seq[i] != 'G'))                                    {       success = 0;    }
1421                                 else if(oligo[i] == 'Y' && (seq[i] != 'C' && seq[i] != 'T'))                                    {       success = 0;    }
1422                                 else if(oligo[i] == 'M' && (seq[i] != 'C' && seq[i] != 'A'))                                    {       success = 0;    }
1423                                 else if(oligo[i] == 'K' && (seq[i] != 'T' && seq[i] != 'G'))                                    {       success = 0;    }
1424                                 else if(oligo[i] == 'W' && (seq[i] != 'T' && seq[i] != 'A'))                                    {       success = 0;    }
1425                                 else if(oligo[i] == 'S' && (seq[i] != 'C' && seq[i] != 'G'))                                    {       success = 0;    }
1426                                 else if(oligo[i] == 'B' && (seq[i] != 'C' && seq[i] != 'T' && seq[i] != 'G'))   {       success = 0;    }
1427                                 else if(oligo[i] == 'D' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'G'))   {       success = 0;    }
1428                                 else if(oligo[i] == 'H' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'C'))   {       success = 0;    }
1429                                 else if(oligo[i] == 'V' && (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G'))   {       success = 0;    }                       
1430                                 
1431                                 if(success == 0)        {       break;   }
1432                         }
1433                         else{
1434                                 success = 1;
1435                         }
1436                 }
1437                 
1438                 return success;
1439         }
1440         catch(exception& e) {
1441                 m->errorOut(e, "TrimSeqsCommand", "compareDNASeq");
1442                 exit(1);
1443         }
1444
1445 }
1446
1447 //***************************************************************************************************************
1448
1449 int TrimSeqsCommand::countDiffs(string oligo, string seq){
1450         try {
1451
1452                 int length = oligo.length();
1453                 int countDiffs = 0;
1454                 
1455                 for(int i=0;i<length;i++){
1456                                                                 
1457                         if(oligo[i] != seq[i]){
1458                                 if(oligo[i] == 'A' || oligo[i] == 'T' || oligo[i] == 'G' || oligo[i] == 'C' || oligo[i] == '-' || oligo[i] == '.')      {       countDiffs++;   }
1459                                 else if((oligo[i] == 'N' || oligo[i] == 'I') && (seq[i] == 'N'))                                {       countDiffs++;   }
1460                                 else if(oligo[i] == 'R' && (seq[i] != 'A' && seq[i] != 'G'))                                    {       countDiffs++;   }
1461                                 else if(oligo[i] == 'Y' && (seq[i] != 'C' && seq[i] != 'T'))                                    {       countDiffs++;   }
1462                                 else if(oligo[i] == 'M' && (seq[i] != 'C' && seq[i] != 'A'))                                    {       countDiffs++;   }
1463                                 else if(oligo[i] == 'K' && (seq[i] != 'T' && seq[i] != 'G'))                                    {       countDiffs++;   }
1464                                 else if(oligo[i] == 'W' && (seq[i] != 'T' && seq[i] != 'A'))                                    {       countDiffs++;   }
1465                                 else if(oligo[i] == 'S' && (seq[i] != 'C' && seq[i] != 'G'))                                    {       countDiffs++;   }
1466                                 else if(oligo[i] == 'B' && (seq[i] != 'C' && seq[i] != 'T' && seq[i] != 'G'))   {       countDiffs++;   }
1467                                 else if(oligo[i] == 'D' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'G'))   {       countDiffs++;   }
1468                                 else if(oligo[i] == 'H' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'C'))   {       countDiffs++;   }
1469                                 else if(oligo[i] == 'V' && (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G'))   {       countDiffs++;   }       
1470                         }
1471                         
1472                 }
1473                 
1474                 return countDiffs;
1475         }
1476         catch(exception& e) {
1477                 m->errorOut(e, "TrimSeqsCommand", "countDiffs");
1478                 exit(1);
1479         }
1480
1481 }
1482
1483 //***************************************************************************************************************