]> git.donarmstrong.com Git - mothur.git/blob - trimseqscommand.cpp
fixed problem with sffinfo that caused a file mismatch when the trimming trimmed...
[mothur.git] / trimseqscommand.cpp
1 /*
2  *  trimseqscommand.cpp
3  *  Mothur
4  *
5  *  Created by Pat Schloss on 6/6/09.
6  *  Copyright 2009 Patrick D. Schloss. All rights reserved.
7  *
8  */
9
10 #include "trimseqscommand.h"
11 #include "needlemanoverlap.hpp"
12
13 //**********************************************************************************************************************
14
15 vector<string> TrimSeqsCommand::getValidParameters(){   
16         try {
17                 string Array[] =  {"fasta", "flip", "oligos", "maxambig", "maxhomop","minlength", "maxlength", "qfile", 
18                                                                         "qthreshold", "qwindowaverage", "qstepsize", "qwindowsize", "qaverage", "rollaverage",
19                                                                         "keepfirst", "removelast",
20                                                                         "allfiles", "qtrim","tdiffs", "pdiffs", "bdiffs", "processors", "outputdir","inputdir"};
21                 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
22                 return myArray;
23         }
24         catch(exception& e) {
25                 m->errorOut(e, "TrimSeqsCommand", "getValidParameters");
26                 exit(1);
27         }
28 }
29
30 //**********************************************************************************************************************
31
32 TrimSeqsCommand::TrimSeqsCommand(){     
33         try {
34                 abort = true; calledHelp = true; 
35                 vector<string> tempOutNames;
36                 outputTypes["fasta"] = tempOutNames;
37                 outputTypes["qfile"] = tempOutNames;
38                 outputTypes["group"] = tempOutNames;
39         }
40         catch(exception& e) {
41                 m->errorOut(e, "TrimSeqsCommand", "TrimSeqsCommand");
42                 exit(1);
43         }
44 }
45
46 //**********************************************************************************************************************
47
48 vector<string> TrimSeqsCommand::getRequiredParameters(){        
49         try {
50                 string Array[] =  {"fasta"};
51                 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
52                 return myArray;
53         }
54         catch(exception& e) {
55                 m->errorOut(e, "TrimSeqsCommand", "getRequiredParameters");
56                 exit(1);
57         }
58 }
59
60 //**********************************************************************************************************************
61
62 vector<string> TrimSeqsCommand::getRequiredFiles(){     
63         try {
64                 vector<string> myArray;
65                 return myArray;
66         }
67         catch(exception& e) {
68                 m->errorOut(e, "TrimSeqsCommand", "getRequiredFiles");
69                 exit(1);
70         }
71 }
72
73 //***************************************************************************************************************
74
75 TrimSeqsCommand::TrimSeqsCommand(string option)  {
76         try {
77                 
78                 abort = false; calledHelp = false;   
79                 comboStarts = 0;
80                 
81                 //allow user to run help
82                 if(option == "help") { help(); abort = true; calledHelp = true; }
83                 
84                 else {
85                         //valid paramters for this command
86                         string AlignArray[] =  {        "fasta", "flip", "oligos", "maxambig", "maxhomop", "minlength", "maxlength", "qfile", 
87                                                                 "qthreshold", "qwindowaverage", "qstepsize", "qwindowsize", "qaverage", "rollaverage",
88                                                                 "keepfirst", "removelast",
89                                                                 "allfiles", "qtrim","tdiffs", "pdiffs", "bdiffs", "processors", "outputdir","inputdir"};
90                         
91                         vector<string> myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string)));
92                         
93                         OptionParser parser(option);
94                         map<string,string> parameters = parser.getParameters();
95                         
96                         ValidParameters validParameter;
97                         map<string,string>::iterator it;
98                         
99                         //check to make sure all parameters are valid for command
100                         for (it = parameters.begin(); it != parameters.end(); it++) { 
101                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
102                         }
103                         
104                         //initialize outputTypes
105                         vector<string> tempOutNames;
106                         outputTypes["fasta"] = tempOutNames;
107                         outputTypes["qfile"] = tempOutNames;
108                         outputTypes["group"] = tempOutNames;
109                         
110                         //if the user changes the input directory command factory will send this info to us in the output parameter 
111                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
112                         if (inputDir == "not found"){   inputDir = "";          }
113                         else {
114                                 string path;
115                                 it = parameters.find("fasta");
116                                 //user has given a template file
117                                 if(it != parameters.end()){ 
118                                         path = m->hasPath(it->second);
119                                         //if the user has not given a path then, add inputdir. else leave path alone.
120                                         if (path == "") {       parameters["fasta"] = inputDir + it->second;            }
121                                 }
122                                 
123                                 it = parameters.find("oligos");
124                                 //user has given a template file
125                                 if(it != parameters.end()){ 
126                                         path = m->hasPath(it->second);
127                                         //if the user has not given a path then, add inputdir. else leave path alone.
128                                         if (path == "") {       parameters["oligos"] = inputDir + it->second;           }
129                                 }
130                                 
131                                 it = parameters.find("qfile");
132                                 //user has given a template file
133                                 if(it != parameters.end()){ 
134                                         path = m->hasPath(it->second);
135                                         //if the user has not given a path then, add inputdir. else leave path alone.
136                                         if (path == "") {       parameters["qfile"] = inputDir + it->second;            }
137                                 }
138                                 
139                         }
140
141                         
142                         //check for required parameters
143                         fastaFile = validParameter.validFile(parameters, "fasta", true);
144                         if (fastaFile == "not found") { m->mothurOut("fasta is a required parameter for the trim.seqs command."); m->mothurOutEndLine(); abort = true; }
145                         else if (fastaFile == "not open") { abort = true; }     
146                         
147                         //if the user changes the output directory command factory will send this info to us in the output parameter 
148                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  
149                                 outputDir = ""; 
150                                 outputDir += m->hasPath(fastaFile); //if user entered a file with a path then preserve it       
151                         }
152                 
153                         
154                         //check for optional parameter and set defaults
155                         // ...at some point should added some additional type checking...
156                         string temp;
157                         temp = validParameter.validFile(parameters, "flip", false);
158                         if (temp == "not found"){       flip = 0;       }
159                         else if(m->isTrue(temp))        {       flip = 1;       }
160                 
161                         temp = validParameter.validFile(parameters, "oligos", true);
162                         if (temp == "not found"){       oligoFile = "";         }
163                         else if(temp == "not open"){    abort = true;   } 
164                         else                                    {       oligoFile = temp;               }
165                         
166                         
167                         temp = validParameter.validFile(parameters, "maxambig", false);         if (temp == "not found") { temp = "-1"; }
168                         convert(temp, maxAmbig);  
169
170                         temp = validParameter.validFile(parameters, "maxhomop", false);         if (temp == "not found") { temp = "0"; }
171                         convert(temp, maxHomoP);  
172
173                         temp = validParameter.validFile(parameters, "minlength", false);        if (temp == "not found") { temp = "0"; }
174                         convert(temp, minLength); 
175                         
176                         temp = validParameter.validFile(parameters, "maxlength", false);        if (temp == "not found") { temp = "0"; }
177                         convert(temp, maxLength);
178                         
179                         temp = validParameter.validFile(parameters, "bdiffs", false);           if (temp == "not found") { temp = "0"; }
180                         convert(temp, bdiffs);
181                         
182                         temp = validParameter.validFile(parameters, "pdiffs", false);           if (temp == "not found") { temp = "0"; }
183                         convert(temp, pdiffs);
184                         
185                         temp = validParameter.validFile(parameters, "tdiffs", false);           if (temp == "not found") { int tempTotal = pdiffs + bdiffs;  temp = toString(tempTotal); }
186                         convert(temp, tdiffs);
187                         
188                         if(tdiffs == 0){        tdiffs = bdiffs + pdiffs;       }
189                         
190                         temp = validParameter.validFile(parameters, "qfile", true);     
191                         if (temp == "not found")        {       qFileName = "";         }
192                         else if(temp == "not open")     {       abort = true;           }
193                         else                                            {       qFileName = temp;       }
194                         
195                         temp = validParameter.validFile(parameters, "qthreshold", false);       if (temp == "not found") { temp = "0"; }
196                         convert(temp, qThreshold);
197                         
198                         temp = validParameter.validFile(parameters, "qtrim", false);            if (temp == "not found") { temp = "t"; }
199                         qtrim = m->isTrue(temp);
200
201                         temp = validParameter.validFile(parameters, "rollaverage", false);      if (temp == "not found") { temp = "0"; }
202                         convert(temp, qRollAverage);
203
204                         temp = validParameter.validFile(parameters, "qwindowaverage", false);if (temp == "not found") { temp = "0"; }
205                         convert(temp, qWindowAverage);
206
207                         temp = validParameter.validFile(parameters, "qwindowsize", false);      if (temp == "not found") { temp = "50"; }
208                         convert(temp, qWindowSize);
209
210                         temp = validParameter.validFile(parameters, "qstepsize", false);        if (temp == "not found") { temp = "1"; }
211                         convert(temp, qWindowStep);
212
213                         temp = validParameter.validFile(parameters, "qaverage", false);         if (temp == "not found") { temp = "0"; }
214                         convert(temp, qAverage);
215
216                         temp = validParameter.validFile(parameters, "keepfirst", false);        if (temp == "not found") { temp = "0"; }
217                         convert(temp, keepFirst);
218
219                         temp = validParameter.validFile(parameters, "removelast", false);       if (temp == "not found") { temp = "0"; }
220                         convert(temp, removeLast);
221                         
222                         temp = validParameter.validFile(parameters, "allfiles", false);         if (temp == "not found") { temp = "F"; }
223                         allFiles = m->isTrue(temp);
224                         
225                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found") { temp = "1"; }
226                         convert(temp, processors); 
227                         
228                         
229                         if(allFiles && (oligoFile == "")){
230                                 m->mothurOut("You selected allfiles, but didn't enter an oligos.  Ignoring the allfiles request."); m->mothurOutEndLine();
231                         }
232                         if((qAverage != 0 && qThreshold != 0) && qFileName == ""){
233                                 m->mothurOut("You didn't provide a quality file name, quality criteria will be ignored."); m->mothurOutEndLine();
234                                 qAverage=0;
235                                 qThreshold=0;
236                         }
237                         if(!flip && oligoFile=="" && !maxLength && !minLength && (maxAmbig==-1) && !maxHomoP && qFileName == ""){               
238                                 m->mothurOut("You didn't set any options... quiting command."); m->mothurOutEndLine();
239                                 abort = true;
240                         }
241                 }
242
243         }
244         catch(exception& e) {
245                 m->errorOut(e, "TrimSeqsCommand", "TrimSeqsCommand");
246                 exit(1);
247         }
248 }
249
250 //**********************************************************************************************************************
251
252 void TrimSeqsCommand::help(){
253         try {
254                 m->mothurOut("The trim.seqs command reads a fastaFile and creates 2 new fasta files, .trim.fasta and scrap.fasta, as well as group files if you provide and oligos file.\n");
255                 m->mothurOut("The .trim.fasta contains sequences that meet your requirements, and the .scrap.fasta contains those which don't.\n");
256                 m->mothurOut("The trim.seqs command parameters are fasta, flip, oligos, maxambig, maxhomop, minlength, maxlength, qfile, qthreshold, qaverage, diffs, qtrim, keepfirst, removelast and allfiles.\n");
257                 m->mothurOut("The fasta parameter is required.\n");
258                 m->mothurOut("The flip parameter will output the reverse compliment of your trimmed sequence. The default is false.\n");
259                 m->mothurOut("The oligos parameter allows you to provide an oligos file.\n");
260                 m->mothurOut("The maxambig parameter allows you to set the maximum number of ambigious bases allowed. The default is -1.\n");
261                 m->mothurOut("The maxhomop parameter allows you to set a maximum homopolymer length. \n");
262                 m->mothurOut("The minlength parameter allows you to set and minimum sequence length. \n");
263                 m->mothurOut("The maxlength parameter allows you to set and maximum sequence length. \n");
264                 m->mothurOut("The tdiffs parameter is used to specify the total number of differences allowed in the sequence. The default is pdiffs + bdiffs.\n");
265                 m->mothurOut("The bdiffs parameter is used to specify the number of differences allowed in the barcode. The default is 0.\n");
266                 m->mothurOut("The pdiffs parameter is used to specify the number of differences allowed in the primer. The default is 0.\n");
267                 m->mothurOut("The qfile parameter allows you to provide a quality file.\n");
268                 m->mothurOut("The qthreshold parameter allows you to set a minimum quality score allowed. \n");
269                 m->mothurOut("The qaverage parameter allows you to set a minimum average quality score allowed. \n");
270                 m->mothurOut("The qwindowsize parameter allows you to set a number of bases in a window. Default=50.\n");
271                 m->mothurOut("The qwindowaverage parameter allows you to set a minimum average quality score allowed over a window. \n");
272                 m->mothurOut("The rollaverage parameter allows you to set a minimum rolling average quality score allowed over a window. \n");
273                 m->mothurOut("The qstepsize parameter allows you to set a number of bases to move the window over. Default=1.\n");
274                 m->mothurOut("The allfiles parameter will create separate group and fasta file for each grouping. The default is F.\n");
275                 m->mothurOut("The qtrim parameter will trim sequence from the point that they fall below the qthreshold and put it in the .trim file if set to true. The default is T.\n");
276                 m->mothurOut("The keepfirst parameter trims the sequence to the first keepfirst number of bases after the barcode or primers are removed, before the sequence is checked to see if it meets the other requirements. \n");
277                 m->mothurOut("The removelast removes the last removelast number of bases after the barcode or primers are removed, before the sequence is checked to see if it meets the other requirements.\n");
278                 m->mothurOut("The trim.seqs command should be in the following format: \n");
279                 m->mothurOut("trim.seqs(fasta=yourFastaFile, flip=yourFlip, oligos=yourOligos, maxambig=yourMaxambig,  \n");
280                 m->mothurOut("maxhomop=yourMaxhomop, minlength=youMinlength, maxlength=yourMaxlength)  \n");    
281                 m->mothurOut("Example trim.seqs(fasta=abrecovery.fasta, flip=..., oligos=..., maxambig=..., maxhomop=..., minlength=..., maxlength=...).\n");
282                 m->mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n");
283                 m->mothurOut("For more details please check out the wiki http://www.mothur.org/wiki/Trim.seqs .\n\n");
284
285         }
286         catch(exception& e) {
287                 m->errorOut(e, "TrimSeqsCommand", "help");
288                 exit(1);
289         }
290 }
291
292
293 //***************************************************************************************************************
294
295 TrimSeqsCommand::~TrimSeqsCommand(){    /*      do nothing      */      }
296
297 //***************************************************************************************************************
298
299 int TrimSeqsCommand::execute(){
300         try{
301         
302                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
303                 
304                 numFPrimers = 0;  //this needs to be initialized
305                 numRPrimers = 0;
306                 vector<vector<string> > fastaFileNames;
307                 vector<vector<string> > qualFileNames;
308                 
309                 string trimSeqFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "trim.fasta";
310                 outputNames.push_back(trimSeqFile); outputTypes["fasta"].push_back(trimSeqFile);
311                 
312                 string scrapSeqFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "scrap.fasta";
313                 outputNames.push_back(scrapSeqFile); outputTypes["fasta"].push_back(scrapSeqFile);
314                 
315                 string trimQualFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "trim.qual";
316                 string scrapQualFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "scrap.qual";
317                 if (qFileName != "") {
318                         outputNames.push_back(trimQualFile);
319                         outputNames.push_back(scrapQualFile);
320                         outputTypes["qfile"].push_back(trimQualFile);
321                         outputTypes["qfile"].push_back(scrapQualFile); 
322                 }
323                 
324                 string outputGroupFileName;
325                 if(oligoFile != ""){
326                         outputGroupFileName = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "groups";
327                         outputNames.push_back(outputGroupFileName); outputTypes["group"].push_back(outputGroupFileName);
328                         getOligos(fastaFileNames, qualFileNames);
329                 }
330
331                 vector<unsigned long int> fastaFilePos;
332                 vector<unsigned long int> qFilePos;
333                 
334                 setLines(fastaFile, qFileName, fastaFilePos, qFilePos);
335                 
336                 for (int i = 0; i < (fastaFilePos.size()-1); i++) {
337                         lines.push_back(new linePair(fastaFilePos[i], fastaFilePos[(i+1)]));
338                         if (qFileName != "") {  qLines.push_back(new linePair(qFilePos[i], qFilePos[(i+1)]));  }
339                 }       
340                 if(qFileName == "")     {       qLines = lines; } //files with duds
341                 
342                 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
343                                 if(processors == 1){
344                                         driverCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, outputGroupFileName, fastaFileNames, qualFileNames, lines[0], qLines[0]);
345                                 }else{
346                                         createProcessesCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, outputGroupFileName, fastaFileNames, qualFileNames); 
347                                 }       
348                 #else
349                                 driverCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, outputGroupFileName, fastaFileNames, qualFileNames, lines[0], qLines[0]);
350                 #endif
351                 
352                 if (m->control_pressed) {  return 0; }                  
353                         
354                 if(allFiles){
355                         map<string, string> uniqueFastaNames;// so we don't add the same groupfile multiple times
356                         map<string, string>::iterator it;
357                         set<string> namesToRemove;
358                         for(int i=0;i<fastaFileNames.size();i++){
359                                 for(int j=0;j<fastaFileNames[0].size();j++){
360                                         if (fastaFileNames[i][j] != "") {
361                                                 if(m->isBlank(fastaFileNames[i][j])){
362                                                         remove(fastaFileNames[i][j].c_str());
363                                                         namesToRemove.insert(fastaFileNames[i][j]);
364                                                         
365                                                         if(qFileName != ""){
366                                                                 remove(qualFileNames[i][j].c_str());
367                                                                 namesToRemove.insert(qualFileNames[i][j]);
368                                                         }
369                                                 }else{  
370                                                         it = uniqueFastaNames.find(fastaFileNames[i][j]);
371                                                         if (it == uniqueFastaNames.end()) {     
372                                                                 uniqueFastaNames[fastaFileNames[i][j]] = barcodeNameVector[i];  
373                                                         }       
374                                                 }
375                                         }
376                                 }
377                         }
378                         
379                         //remove names for outputFileNames, just cleans up the output
380                         vector<string> outputNames2;
381                         for(int i = 0; i < outputNames.size(); i++) { if (namesToRemove.count(outputNames[i]) == 0) { outputNames2.push_back(outputNames[i]); } }
382                         outputNames = outputNames2;
383                         
384                         for (it = uniqueFastaNames.begin(); it != uniqueFastaNames.end(); it++) {
385                                 ifstream in;
386                                 m->openInputFile(it->first, in);
387                                 
388                                 ofstream out;
389                                 string thisGroupName = outputDir + m->getRootName(m->getSimpleName(it->first)) + "groups";
390                                 outputNames.push_back(thisGroupName); outputTypes["group"].push_back(thisGroupName);
391                                 m->openOutputFile(thisGroupName, out);
392                                 
393                                 while (!in.eof()){
394                                         if (m->control_pressed) { break; }
395                                         
396                                         Sequence currSeq(in); m->gobble(in);
397                                         out << currSeq.getName() << '\t' << it->second << endl;
398                                 }
399                                 in.close();
400                                 out.close();
401                         }
402                 }
403                 
404                 if (m->control_pressed) {       for (int i = 0; i < outputNames.size(); i++) {  remove(outputNames[i].c_str()); } return 0;     }
405
406                 //output group counts
407                 m->mothurOutEndLine();
408                 int total = 0;
409                 for (map<string, int>::iterator it = groupCounts.begin(); it != groupCounts.end(); it++) {
410                          total += it->second; m->mothurOut("Group " + it->first + " contains " + toString(it->second) + " sequences."); m->mothurOutEndLine(); 
411                 }
412                 if (total != 0) { m->mothurOut("Total of all groups is " + toString(total)); m->mothurOutEndLine(); }
413                 
414                 if (m->control_pressed) {       for (int i = 0; i < outputNames.size(); i++) {  remove(outputNames[i].c_str()); } return 0;     }
415
416                 //set fasta file as new current fastafile
417                 string current = "";
418                 itTypes = outputTypes.find("fasta");
419                 if (itTypes != outputTypes.end()) {
420                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFastaFile(current); }
421                 }
422                 
423                 itTypes = outputTypes.find("qfile");
424                 if (itTypes != outputTypes.end()) {
425                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setQualFile(current); }
426                 }
427                 
428                 itTypes = outputTypes.find("group");
429                 if (itTypes != outputTypes.end()) {
430                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setGroupFile(current); }
431                 }
432
433                 m->mothurOutEndLine();
434                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
435                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
436                 m->mothurOutEndLine();
437                 
438                 return 0;       
439                         
440         }
441         catch(exception& e) {
442                 m->errorOut(e, "TrimSeqsCommand", "execute");
443                 exit(1);
444         }
445 }
446                 
447 /**************************************************************************************/
448
449 int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string trimFileName, string scrapFileName, string trimQFileName, string scrapQFileName, string groupFileName, vector<vector<string> > fastaFileNames, vector<vector<string> > qualFileNames, linePair* line, linePair* qline) {        
450                 
451         try {
452                 
453                 ofstream trimFASTAFile;
454                 m->openOutputFile(trimFileName, trimFASTAFile);
455                 
456                 ofstream scrapFASTAFile;
457                 m->openOutputFile(scrapFileName, scrapFASTAFile);
458                 
459                 ofstream trimQualFile;
460                 ofstream scrapQualFile;
461                 if(qFileName != ""){
462                         m->openOutputFile(trimQFileName, trimQualFile);
463                         m->openOutputFile(scrapQFileName, scrapQualFile);
464                 }
465                 
466                 ofstream outGroupsFile;
467                 if (oligoFile != ""){   m->openOutputFile(groupFileName, outGroupsFile);   }
468                 if(allFiles){
469                         for (int i = 0; i < fastaFileNames.size(); i++) { //clears old file
470                                 for (int j = 0; j < fastaFileNames[i].size(); j++) { //clears old file
471                                         if (fastaFileNames[i][j] != "") {
472                                                 ofstream temp;
473                                                 m->openOutputFile(fastaFileNames[i][j], temp);                  temp.close();
474                                                 if(qFileName != ""){
475                                                         m->openOutputFile(qualFileNames[i][j], temp);                   temp.close();
476                                                 }
477                                         }
478                                 }
479                         }
480                 }
481                 
482                 ifstream inFASTA;
483                 m->openInputFile(filename, inFASTA);
484                 inFASTA.seekg(line->start);
485                 
486                 ifstream qFile;
487                 if(qFileName != "")     {
488                         m->openInputFile(qFileName, qFile);
489                         qFile.seekg(qline->start);  
490                 }
491                 
492                 int count = 0;
493                 bool moreSeqs = 1;
494         
495                 while (moreSeqs) {
496                                 
497                         if (m->control_pressed) { 
498                                 inFASTA.close(); trimFASTAFile.close(); scrapFASTAFile.close();
499                                 if (oligoFile != "") {   outGroupsFile.close();   }
500
501                                 if(qFileName != ""){
502                                         qFile.close();
503                                 }
504                                 for (int i = 0; i < outputNames.size(); i++) {  remove(outputNames[i].c_str()); }
505
506                                 return 0;
507                         }
508                         
509                         int success = 1;
510                         string trashCode = "";
511                         int currentSeqsDiffs = 0;
512
513                         Sequence currSeq(inFASTA); m->gobble(inFASTA);
514
515                         QualityScores currQual;
516                         if(qFileName != ""){
517                                 currQual = QualityScores(qFile);  m->gobble(qFile);
518                         }
519
520                         string origSeq = currSeq.getUnaligned();
521                         if (origSeq != "") {
522                                 
523                                 int barcodeIndex = 0;
524                                 int primerIndex = 0;
525                                 
526                                 if(barcodes.size() != 0){
527                                         success = stripBarcode(currSeq, currQual, barcodeIndex);
528                                         if(success > bdiffs)            {       trashCode += 'b';       }
529                                         else{ currentSeqsDiffs += success;  }
530                                 }
531                                 
532                                 if(numFPrimers != 0){
533                                         success = stripForward(currSeq, currQual, primerIndex);
534                                         if(success > pdiffs)            {       trashCode += 'f';       }
535                                         else{ currentSeqsDiffs += success;  }
536                                 }
537                                 
538                                 if (currentSeqsDiffs > tdiffs)  {       trashCode += 't';   }
539                                 
540                                 if(numRPrimers != 0){
541                                         success = stripReverse(currSeq, currQual);
542                                         if(!success)                            {       trashCode += 'r';       }
543                                 }
544
545                                 if(keepFirst != 0){
546                                         success = keepFirstTrim(currSeq, currQual);
547                                 }
548                                 
549                                 if(removeLast != 0){
550                                         success = removeLastTrim(currSeq, currQual);
551                                         if(!success)                            {       trashCode += 'l';       }
552                                 }
553
554                                 
555                                 if(qFileName != ""){
556                                         int origLength = currSeq.getNumBases();
557                                         
558                                         if(qThreshold != 0)                     {       success = currQual.stripQualThreshold(currSeq, qThreshold);                     }
559                                         else if(qAverage != 0)          {       success = currQual.cullQualAverage(currSeq, qAverage);                          }
560                                         else if(qRollAverage != 0)      {       success = currQual.stripQualRollingAverage(currSeq, qRollAverage);      }
561                                         else if(qWindowAverage != 0){   success = currQual.stripQualWindowAverage(currSeq, qWindowStep, qWindowSize, qWindowAverage);   }
562                                         else                                            {       success = 1;                            }
563                                         
564                                         //you don't want to trim, if it fails above then scrap it
565                                         if ((!qtrim) && (origLength != currSeq.getNumBases())) {  success = 0; }
566                                         
567                                         if(!success)                            {       trashCode += 'q';       }
568                                 }                               
569                 
570                                 if(minLength > 0 || maxLength > 0){
571                                         success = cullLength(currSeq);
572                                         if(!success)                            {       trashCode += 'l';       }
573                                 }
574                                 if(maxHomoP > 0){
575                                         success = cullHomoP(currSeq);
576                                         if(!success)                            {       trashCode += 'h';       }
577                                 }
578                                 if(maxAmbig != -1){
579                                         success = cullAmbigs(currSeq);
580                                         if(!success)                            {       trashCode += 'n';       }
581                                 }
582                                 
583                                 if(flip){               // should go last                       
584                                         currSeq.reverseComplement();
585                                         if(qFileName != ""){
586                                                 currQual.flipQScores(); 
587                                         }
588                                 }
589                                 
590                                 if(trashCode.length() == 0){
591                                         currSeq.setAligned(currSeq.getUnaligned());
592                                         currSeq.printSequence(trimFASTAFile);
593                                         
594                                         if(qFileName != ""){
595                                                 currQual.printQScores(trimQualFile);
596                                         }
597                                         
598                                         if(barcodes.size() != 0){
599                                                 string thisGroup = barcodeNameVector[barcodeIndex];
600                                                 if (primers.size() != 0) {  if (primerNameVector[primerIndex] != "") { thisGroup += "." + primerNameVector[primerIndex]; } }
601                                                 
602                                                 outGroupsFile << currSeq.getName() << '\t' << thisGroup << endl;
603                                                 
604                                                 map<string, int>::iterator it = groupCounts.find(thisGroup);
605                                                 if (it == groupCounts.end()) {  groupCounts[thisGroup] = 1; }
606                                                 else { groupCounts[it->first]++; }
607                                                         
608                                         }
609                                         
610                                         
611                                         if(allFiles){
612                                                 ofstream output;
613                                                 m->openOutputFileAppend(fastaFileNames[barcodeIndex][primerIndex], output);
614                                                 currSeq.printSequence(output);
615                                                 output.close();
616                                                 
617                                                 if(qFileName != ""){
618                                                         m->openOutputFileAppend(qualFileNames[barcodeIndex][primerIndex], output);
619                                                         currQual.printQScores(output);
620                                                         output.close();                                                 
621                                                 }
622                                         }
623                                 }
624                                 else{
625                                         currSeq.setName(currSeq.getName() + '|' + trashCode);
626                                         currSeq.setUnaligned(origSeq);
627                                         currSeq.setAligned(origSeq);
628                                         currSeq.printSequence(scrapFASTAFile);
629                                         if(qFileName != ""){
630                                                 currQual.printQScores(scrapQualFile);
631                                         }
632                                 }
633                                 count++;
634                         }
635                         
636                         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
637                                 unsigned long int pos = inFASTA.tellg();
638                                 if ((pos == -1) || (pos >= line->end)) { break; }
639                         #else
640                                 if (inFASTA.eof()) { break; }
641                         #endif
642                                 
643                         //report progress
644                         if((count) % 1000 == 0){        m->mothurOut(toString(count)); m->mothurOutEndLine();           }
645                         
646                 }
647                 //report progress
648                 if((count) % 1000 != 0){        m->mothurOut(toString(count)); m->mothurOutEndLine();           }
649
650                 
651                 inFASTA.close();
652                 trimFASTAFile.close();
653                 scrapFASTAFile.close();
654                 if (oligoFile != "") {   outGroupsFile.close();   }
655                 if(qFileName != "")     {       qFile.close();  scrapQualFile.close(); trimQualFile.close();    }
656                 
657                 return count;
658         }
659         catch(exception& e) {
660                 m->errorOut(e, "TrimSeqsCommand", "driverCreateTrim");
661                 exit(1);
662         }
663 }
664
665 /**************************************************************************************************/
666
667 int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName, string trimFASTAFileName, string scrapFASTAFileName, string trimQualFileName, string scrapQualFileName, string groupFile, vector<vector<string> > fastaFileNames, vector<vector<string> > qualFileNames) {
668         try {
669 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
670                 int process = 1;
671                 int exitCommand = 1;
672                 processIDS.clear();
673                 
674                 //loop through and create all the processes you want
675                 while (process != processors) {
676                         int pid = fork();
677                         
678                         if (pid > 0) {
679                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
680                                 process++;
681                         }else if (pid == 0){
682                                 
683                                 vector<vector<string> > tempFASTAFileNames = fastaFileNames;
684                                 vector<vector<string> > tempPrimerQualFileNames = qualFileNames;
685
686                                 if(allFiles){
687                                         ofstream temp;
688
689                                         for(int i=0;i<tempFASTAFileNames.size();i++){
690                                                 for(int j=0;j<tempFASTAFileNames[i].size();j++){
691                                                         if (tempFASTAFileNames[i][j] != "") {
692                                                                 tempFASTAFileNames[i][j] += toString(getpid()) + ".temp";
693                                                                 m->openOutputFile(tempFASTAFileNames[i][j], temp);                      temp.close();
694
695                                                                 if(qFileName != ""){
696                                                                         tempPrimerQualFileNames[i][j] += toString(getpid()) + ".temp";
697                                                                         m->openOutputFile(tempPrimerQualFileNames[i][j], temp);         temp.close();
698                                                                 }
699                                                         }
700                                                 }
701                                         }
702                                 }
703                                                         
704                                 driverCreateTrim(filename,
705                                                                  qFileName,
706                                                                  (trimFASTAFileName + toString(getpid()) + ".temp"),
707                                                                  (scrapFASTAFileName + toString(getpid()) + ".temp"),
708                                                                  (trimQualFileName + toString(getpid()) + ".temp"),
709                                                                  (scrapQualFileName + toString(getpid()) + ".temp"),
710                                                                  (groupFile + toString(getpid()) + ".temp"),
711                                                                  tempFASTAFileNames,
712                                                                  tempPrimerQualFileNames,
713                                                                  lines[process],
714                                                                  qLines[process]);
715                                 
716                                 //pass groupCounts to parent
717                                 ofstream out;
718                                 string tempFile = filename + toString(getpid()) + ".num.temp";
719                                 m->openOutputFile(tempFile, out);
720                                 for (map<string, int>::iterator it = groupCounts.begin(); it != groupCounts.end(); it++) {
721                                         out << it->first << '\t' << it->second << endl;
722                                 }
723                                 out.close();
724                                 
725                                 exit(0);
726                         }else { 
727                                 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine(); 
728                                 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
729                                 exit(0);
730                         }
731                 }
732                 
733                 //parent do my part
734                 ofstream temp;
735                 m->openOutputFile(trimFASTAFileName, temp);             temp.close();
736                 m->openOutputFile(scrapFASTAFileName, temp);    temp.close();
737                 m->openOutputFile(trimQualFileName, temp);              temp.close();
738                 m->openOutputFile(scrapQualFileName, temp);             temp.close();
739
740                 driverCreateTrim(filename, qFileName, trimFASTAFileName, scrapFASTAFileName, trimQualFileName, scrapQualFileName, groupFile, fastaFileNames, qualFileNames, lines[0], qLines[0]);
741                 
742                 //force parent to wait until all the processes are done
743                 for (int i=0;i<processIDS.size();i++) { 
744                         int temp = processIDS[i];
745                         wait(&temp);
746                 }
747                 
748                 //append files
749                 for(int i=0;i<processIDS.size();i++){
750                         
751                         m->mothurOut("Appending files from process " + toString(processIDS[i])); m->mothurOutEndLine();
752                         
753                         m->appendFiles((trimFASTAFileName + toString(processIDS[i]) + ".temp"), trimFASTAFileName);
754                         remove((trimFASTAFileName + toString(processIDS[i]) + ".temp").c_str());
755                         m->appendFiles((scrapFASTAFileName + toString(processIDS[i]) + ".temp"), scrapFASTAFileName);
756                         remove((scrapFASTAFileName + toString(processIDS[i]) + ".temp").c_str());
757                         
758                         if(qFileName != ""){
759                                 m->appendFiles((trimQualFileName + toString(processIDS[i]) + ".temp"), trimQualFileName);
760                                 remove((trimQualFileName + toString(processIDS[i]) + ".temp").c_str());
761                                 m->appendFiles((scrapQualFileName + toString(processIDS[i]) + ".temp"), scrapQualFileName);
762                                 remove((scrapQualFileName + toString(processIDS[i]) + ".temp").c_str());
763                         }
764                         
765                         m->appendFiles((groupFile + toString(processIDS[i]) + ".temp"), groupFile);
766                         remove((groupFile + toString(processIDS[i]) + ".temp").c_str());
767                         
768                         
769                         if(allFiles){
770                                 for(int j=0;j<fastaFileNames.size();j++){
771                                         for(int k=0;k<fastaFileNames[j].size();k++){
772                                                 if (fastaFileNames[j][k] != "") {
773                                                         m->appendFiles((fastaFileNames[j][k] + toString(processIDS[i]) + ".temp"), fastaFileNames[j][k]);
774                                                         remove((fastaFileNames[j][k] + toString(processIDS[i]) + ".temp").c_str());
775                                                         
776                                                         if(qFileName != ""){
777                                                                 m->appendFiles((qualFileNames[j][k] + toString(processIDS[i]) + ".temp"), qualFileNames[j][k]);
778                                                                 remove((qualFileNames[j][k] + toString(processIDS[i]) + ".temp").c_str());
779                                                         }
780                                                 }
781                                         }
782                                 }
783                         }
784                         
785                         ifstream in;
786                         string tempFile =  filename + toString(processIDS[i]) + ".num.temp";
787                         m->openInputFile(tempFile, in);
788                         int tempNum;
789                         string group;
790                         while (!in.eof()) { 
791                                 in >> group >> tempNum; m->gobble(in);
792                                 
793                                 map<string, int>::iterator it = groupCounts.find(group);
794                                 if (it == groupCounts.end()) {  groupCounts[group] = tempNum; }
795                                 else { groupCounts[it->first] += tempNum; }
796                         }
797                         in.close(); remove(tempFile.c_str());
798                         
799                 }
800         
801                 return exitCommand;
802 #endif          
803         }
804         catch(exception& e) {
805                 m->errorOut(e, "TrimSeqsCommand", "createProcessesCreateTrim");
806                 exit(1);
807         }
808 }
809
810 /**************************************************************************************************/
811
812 int TrimSeqsCommand::setLines(string filename, string qfilename, vector<unsigned long int>& fastaFilePos, vector<unsigned long int>& qfileFilePos) {
813         try {
814                 
815                 //set file positions for fasta file
816                 fastaFilePos = m->divideFile(filename, processors);
817                 
818                 if (qfilename == "") { return processors; }
819                 
820                 //get name of first sequence in each chunk
821                 map<string, int> firstSeqNames;
822                 for (int i = 0; i < (fastaFilePos.size()-1); i++) {
823                         ifstream in;
824                         m->openInputFile(filename, in);
825                         in.seekg(fastaFilePos[i]);
826                 
827                         Sequence temp(in); 
828                         firstSeqNames[temp.getName()] = i;
829                 
830                         in.close();
831                 }
832                                 
833                 //seach for filePos of each first name in the qfile and save in qfileFilePos
834                 ifstream inQual;
835                 m->openInputFile(qfilename, inQual);
836                 
837                 string input;
838                 while(!inQual.eof()){   
839                         input = m->getline(inQual);
840
841                         if (input.length() != 0) {
842                                 if(input[0] == '>'){ //this is a sequence name line
843                                         istringstream nameStream(input);
844                                         
845                                         string sname = "";  nameStream >> sname;
846                                         sname = sname.substr(1);
847                                         
848                                         map<string, int>::iterator it = firstSeqNames.find(sname);
849                                         
850                                         if(it != firstSeqNames.end()) { //this is the start of a new chunk
851                                                 unsigned long int pos = inQual.tellg(); 
852                                                 qfileFilePos.push_back(pos - input.length() - 1);       
853                                                 firstSeqNames.erase(it);
854                                         }
855                                 }
856                         }
857                         
858                         if (firstSeqNames.size() == 0) { break; }
859                 }
860                 inQual.close();
861                 
862                 
863                 if (firstSeqNames.size() != 0) { 
864                         for (map<string, int>::iterator it = firstSeqNames.begin(); it != firstSeqNames.end(); it++) {
865                                 m->mothurOut(it->first + " is in your fasta file and not in your quality file, not using quality file."); m->mothurOutEndLine();
866                         }
867                         qFileName = "";
868                         return processors;
869                 }
870
871                 //get last file position of qfile
872                 FILE * pFile;
873                 unsigned long int size;
874                 
875                 //get num bytes in file
876                 pFile = fopen (qfilename.c_str(),"rb");
877                 if (pFile==NULL) perror ("Error opening file");
878                 else{
879                         fseek (pFile, 0, SEEK_END);
880                         size=ftell (pFile);
881                         fclose (pFile);
882                 }
883                 
884                 qfileFilePos.push_back(size);
885                 
886                 return processors;
887         }
888         catch(exception& e) {
889                 m->errorOut(e, "TrimSeqsCommand", "setLines");
890                 exit(1);
891         }
892 }
893
894 //***************************************************************************************************************
895
896 void TrimSeqsCommand::getOligos(vector<vector<string> >& fastaFileNames, vector<vector<string> >& qualFileNames){
897         try {
898                 ifstream inOligos;
899                 m->openInputFile(oligoFile, inOligos);
900                 
901                 ofstream test;
902                 
903                 string type, oligo, group;
904
905                 int indexPrimer = 0;
906                 int indexBarcode = 0;
907                 
908                 while(!inOligos.eof()){
909
910                         inOligos >> type; m->gobble(inOligos);
911                                         
912                         if(type[0] == '#'){
913                                 while (!inOligos.eof()) {       char c = inOligos.get(); if (c == 10 || c == 13){       break;  }       } // get rest of line if there's any crap there
914                         }
915                         else{
916                                 //make type case insensitive
917                                 for(int i=0;i<type.length();i++){       type[i] = toupper(type[i]);  }
918                                 
919                                 inOligos >> oligo;
920                                 
921                                 for(int i=0;i<oligo.length();i++){
922                                         oligo[i] = toupper(oligo[i]);
923                                         if(oligo[i] == 'U')     {       oligo[i] = 'T'; }
924                                 }
925                                 
926                                 if(type == "FORWARD"){
927                                         group = "";
928                                         
929                                         // get rest of line in case there is a primer name
930                                         while (!inOligos.eof()) {       
931                                                 char c = inOligos.get(); 
932                                                 if (c == 10 || c == 13){        break;  }
933                                                 else if (c == 32 || c == 9){;} //space or tab
934                                                 else {  group += c;  }
935                                         } 
936                                         
937                                         //check for repeat barcodes
938                                         map<string, int>::iterator itPrime = primers.find(oligo);
939                                         if (itPrime != primers.end()) { m->mothurOut("primer " + oligo + " is in your oligos file already."); m->mothurOutEndLine();  }
940                                         
941                                         primers[oligo]=indexPrimer; indexPrimer++;              
942                                         primerNameVector.push_back(group);
943                                 }
944                                 else if(type == "REVERSE"){
945                                         Sequence oligoRC("reverse", oligo);
946                                         oligoRC.reverseComplement();
947                                         revPrimer.push_back(oligoRC.getUnaligned());
948                                 }
949                                 else if(type == "BARCODE"){
950                                         inOligos >> group;
951                                         
952                                         //check for repeat barcodes
953                                         map<string, int>::iterator itBar = barcodes.find(oligo);
954                                         if (itBar != barcodes.end()) { m->mothurOut("barcode " + oligo + " is in your oligos file already."); m->mothurOutEndLine();  }
955                                                 
956                                         barcodes[oligo]=indexBarcode; indexBarcode++;
957                                         barcodeNameVector.push_back(group);
958                                 }
959                                 else{   m->mothurOut(type + " is not recognized as a valid type. Choices are forward, reverse, and barcode. Ignoring " + oligo + "."); m->mothurOutEndLine();  }
960                         }
961                         m->gobble(inOligos);
962                 }       
963                 inOligos.close();
964                 
965                 if(barcodeNameVector.size() == 0 && primerNameVector[0] == ""){ allFiles = 0;   }
966                 
967                 //add in potential combos
968                 if(barcodeNameVector.size() == 0){
969                         barcodes[""] = 0;
970                         barcodeNameVector.push_back("");                        
971                 }
972                 
973                 if(primerNameVector.size() == 0){
974                         primers[""] = 0;
975                         primerNameVector.push_back("");                 
976                 }
977                 
978                 fastaFileNames.resize(barcodeNameVector.size());
979                 for(int i=0;i<fastaFileNames.size();i++){
980                         fastaFileNames[i].assign(primerNameVector.size(), "");
981                 }
982                 if(qFileName != ""){    qualFileNames = fastaFileNames; }
983                 
984                 if(allFiles){
985                         set<string> uniqueNames; //used to cleanup outputFileNames
986                         for(map<string, int>::iterator itBar = barcodes.begin();itBar != barcodes.end();itBar++){
987                                 for(map<string, int>::iterator itPrimer = primers.begin();itPrimer != primers.end(); itPrimer++){
988                                         
989                                         string primerName = primerNameVector[itPrimer->second];
990                                         string barcodeName = barcodeNameVector[itBar->second];
991                                         
992                                         string comboGroupName = "";
993                                         string fastaFileName = "";
994                                         string qualFileName = "";
995                                         
996                                         if(primerName == ""){
997                                                 comboGroupName = barcodeNameVector[itBar->second];
998                                         }
999                                         else{
1000                                                 if(barcodeName == ""){
1001                                                         comboGroupName = primerNameVector[itPrimer->second];
1002                                                 }
1003                                                 else{
1004                                                         comboGroupName = barcodeNameVector[itBar->second] + "." + primerNameVector[itPrimer->second];
1005                                                 }
1006                                         }
1007
1008                                         ofstream temp;
1009                                         fastaFileName = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + comboGroupName + ".fasta";
1010                                         if (uniqueNames.count(fastaFileName) == 0) {
1011                                                 outputNames.push_back(fastaFileName);
1012                                                 outputTypes["fasta"].push_back(fastaFileName);
1013                                                 uniqueNames.insert(fastaFileName);
1014                                         }
1015                                         
1016                                         fastaFileNames[itBar->second][itPrimer->second] = fastaFileName;
1017                                         m->openOutputFile(fastaFileName, temp);         temp.close();
1018
1019                                         if(qFileName != ""){
1020                                                 qualFileName = outputDir + m->getRootName(m->getSimpleName(qFileName)) + comboGroupName + ".qual";
1021                                                 if (uniqueNames.count(fastaFileName) == 0) {
1022                                                         outputNames.push_back(qualFileName);
1023                                                         outputTypes["qfile"].push_back(qualFileName);
1024                                                 }
1025                                                 
1026                                                 qualFileNames[itBar->second][itPrimer->second] = qualFileName;
1027                                                 m->openOutputFile(qualFileName, temp);          temp.close();
1028                                         }
1029                                 }
1030                         }
1031                 }
1032                 numFPrimers = primers.size();
1033                 numRPrimers = revPrimer.size();
1034
1035         }
1036         catch(exception& e) {
1037                 m->errorOut(e, "TrimSeqsCommand", "getOligos");
1038                 exit(1);
1039         }
1040 }
1041
1042 //***************************************************************************************************************
1043
1044 int TrimSeqsCommand::stripBarcode(Sequence& seq, QualityScores& qual, int& group){
1045         try {
1046                 
1047                 string rawSequence = seq.getUnaligned();
1048                 int success = bdiffs + 1;       //guilty until proven innocent
1049                 
1050                 //can you find the barcode
1051                 for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
1052                         string oligo = it->first;
1053                         if(rawSequence.length() < oligo.length()){      //let's just assume that the barcodes are the same length
1054                                 success = bdiffs + 10;                                  //if the sequence is shorter than the barcode then bail out
1055                                 break;  
1056                         }
1057                         
1058                         if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
1059                                 group = it->second;
1060                                 seq.setUnaligned(rawSequence.substr(oligo.length()));
1061                                 
1062                                 if(qual.getName() != ""){
1063                                         qual.trimQScores(oligo.length(), -1);
1064                                 }
1065                                 
1066                                 success = 0;
1067                                 break;
1068                         }
1069                 }
1070                 
1071                 //if you found the barcode or if you don't want to allow for diffs
1072                 if ((bdiffs == 0) || (success == 0)) { return success;  }
1073                 
1074                 else { //try aligning and see if you can find it
1075
1076                         int maxLength = 0;
1077
1078                         Alignment* alignment;
1079                         if (barcodes.size() > 0) {
1080                                 map<string,int>::iterator it=barcodes.begin();
1081
1082                                 for(it;it!=barcodes.end();it++){
1083                                         if(it->first.length() > maxLength){
1084                                                 maxLength = it->first.length();
1085                                         }
1086                                 }
1087                                 alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+bdiffs+1));  
1088
1089                         }else{ alignment = NULL; } 
1090                         
1091                         //can you find the barcode
1092                         int minDiff = 1e6;
1093                         int minCount = 1;
1094                         int minGroup = -1;
1095                         int minPos = 0;
1096                         
1097                         for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
1098                                 string oligo = it->first;
1099 //                              int length = oligo.length();
1100                                 
1101                                 if(rawSequence.length() < maxLength){   //let's just assume that the barcodes are the same length
1102                                         success = bdiffs + 10;
1103                                         break;
1104                                 }
1105                                 
1106                                 //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
1107                                 alignment->align(oligo, rawSequence.substr(0,oligo.length()+bdiffs));
1108                                 oligo = alignment->getSeqAAln();
1109                                 string temp = alignment->getSeqBAln();
1110                 
1111                                 int alnLength = oligo.length();
1112                                 
1113                                 for(int i=oligo.length()-1;i>=0;i--){
1114                                         if(oligo[i] != '-'){    alnLength = i+1;        break;  }
1115                                 }
1116                                 oligo = oligo.substr(0,alnLength);
1117                                 temp = temp.substr(0,alnLength);
1118                                 
1119                                 int numDiff = countDiffs(oligo, temp);
1120                                 
1121                                 if(numDiff < minDiff){
1122                                         minDiff = numDiff;
1123                                         minCount = 1;
1124                                         minGroup = it->second;
1125                                         minPos = 0;
1126                                         for(int i=0;i<alnLength;i++){
1127                                                 if(temp[i] != '-'){
1128                                                         minPos++;
1129                                                 }
1130                                         }
1131                                 }
1132                                 else if(numDiff == minDiff){
1133                                         minCount++;
1134                                 }
1135
1136                         }
1137
1138                         if(minDiff > bdiffs)    {       success = minDiff;              }       //no good matches
1139                         else if(minCount > 1)   {       success = bdiffs + 100; }       //can't tell the difference between multiple barcodes
1140                         else{                                                                                                   //use the best match
1141                                 group = minGroup;
1142                                 seq.setUnaligned(rawSequence.substr(minPos));
1143                                 
1144                                 if(qual.getName() != ""){
1145                                         qual.trimQScores(minPos, -1);
1146                                 }
1147                                 success = minDiff;
1148                         }
1149                         
1150                         if (alignment != NULL) {  delete alignment;  }
1151                         
1152                 }
1153                 
1154                 return success;
1155                 
1156         }
1157         catch(exception& e) {
1158                 m->errorOut(e, "TrimSeqsCommand", "stripBarcode");
1159                 exit(1);
1160         }
1161
1162 }
1163
1164 //***************************************************************************************************************
1165
1166 int TrimSeqsCommand::stripForward(Sequence& seq, QualityScores& qual, int& group){
1167         try {
1168                 string rawSequence = seq.getUnaligned();
1169                 int success = pdiffs + 1;       //guilty until proven innocent
1170                 
1171                 //can you find the primer
1172                 for(map<string,int>::iterator it=primers.begin();it!=primers.end();it++){
1173                         string oligo = it->first;
1174                         if(rawSequence.length() < oligo.length()){      //let's just assume that the primers are the same length
1175                                 success = pdiffs + 10;                                  //if the sequence is shorter than the barcode then bail out
1176                                 break;  
1177                         }
1178                         
1179                         if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
1180                                 group = it->second;
1181                                 seq.setUnaligned(rawSequence.substr(oligo.length()));
1182                                 if(qual.getName() != ""){
1183                                         qual.trimQScores(oligo.length(), -1);
1184                                 }
1185                                 success = 0;
1186                                 break;
1187                         }
1188                 }
1189
1190                 //if you found the barcode or if you don't want to allow for diffs
1191                 if ((pdiffs == 0) || (success == 0)) { return success;  }
1192                 
1193                 else { //try aligning and see if you can find it
1194
1195                         int maxLength = 0;
1196
1197                         Alignment* alignment;
1198                         if (primers.size() > 0) {
1199                                 map<string,int>::iterator it=primers.begin();
1200
1201                                 for(it;it!=primers.end();it++){
1202                                         if(it->first.length() > maxLength){
1203                                                 maxLength = it->first.length();
1204                                         }
1205                                 }
1206                                 alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+pdiffs+1));  
1207
1208                         }else{ alignment = NULL; } 
1209                         
1210                         //can you find the barcode
1211                         int minDiff = 1e6;
1212                         int minCount = 1;
1213                         int minGroup = -1;
1214                         int minPos = 0;
1215                         
1216                         for(map<string,int>::iterator it=primers.begin();it!=primers.end();it++){
1217                                 string oligo = it->first;
1218 //                              int length = oligo.length();
1219                                 
1220                                 if(rawSequence.length() < maxLength){   
1221                                         success = pdiffs + 100;
1222                                         break;
1223                                 }
1224                                 
1225                                 //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
1226                                 alignment->align(oligo, rawSequence.substr(0,oligo.length()+pdiffs));
1227                                 oligo = alignment->getSeqAAln();
1228                                 string temp = alignment->getSeqBAln();
1229                 
1230                                 int alnLength = oligo.length();
1231                                 
1232                                 for(int i=oligo.length()-1;i>=0;i--){
1233                                         if(oligo[i] != '-'){    alnLength = i+1;        break;  }
1234                                 }
1235                                 oligo = oligo.substr(0,alnLength);
1236                                 temp = temp.substr(0,alnLength);
1237                                 
1238                                 int numDiff = countDiffs(oligo, temp);
1239                                 
1240                                 if(numDiff < minDiff){
1241                                         minDiff = numDiff;
1242                                         minCount = 1;
1243                                         minGroup = it->second;
1244                                         minPos = 0;
1245                                         for(int i=0;i<alnLength;i++){
1246                                                 if(temp[i] != '-'){
1247                                                         minPos++;
1248                                                 }
1249                                         }
1250                                 }
1251                                 else if(numDiff == minDiff){
1252                                         minCount++;
1253                                 }
1254
1255                         }
1256
1257                         if(minDiff > pdiffs)    {       success = minDiff;              }       //no good matches
1258                         else if(minCount > 1)   {       success = pdiffs + 10;  }       //can't tell the difference between multiple primers
1259                         else{                                                                                                   //use the best match
1260                                 group = minGroup;
1261                                 seq.setUnaligned(rawSequence.substr(minPos));
1262                                 if(qual.getName() != ""){
1263                                         qual.trimQScores(minPos, -1);
1264                                 }
1265                                 success = minDiff;
1266                         }
1267                         
1268                         if (alignment != NULL) {  delete alignment;  }
1269                         
1270                 }
1271                 
1272                 return success;
1273
1274         }
1275         catch(exception& e) {
1276                 m->errorOut(e, "TrimSeqsCommand", "stripForward");
1277                 exit(1);
1278         }
1279 }
1280
1281 //***************************************************************************************************************
1282
1283 bool TrimSeqsCommand::stripReverse(Sequence& seq, QualityScores& qual){
1284         try {
1285                 string rawSequence = seq.getUnaligned();
1286                 bool success = 0;       //guilty until proven innocent
1287                 
1288                 for(int i=0;i<numRPrimers;i++){
1289                         string oligo = revPrimer[i];
1290                         
1291                         if(rawSequence.length() < oligo.length()){
1292                                 success = 0;
1293                                 break;
1294                         }
1295                         
1296                         if(compareDNASeq(oligo, rawSequence.substr(rawSequence.length()-oligo.length(),oligo.length()))){
1297                                 seq.setUnaligned(rawSequence.substr(0,rawSequence.length()-oligo.length()));
1298                                 if(qual.getName() != ""){
1299                                         qual.trimQScores(-1, rawSequence.length()-oligo.length());
1300                                 }
1301                                 success = 1;
1302                                 break;
1303                         }
1304                 }       
1305                 return success;
1306                 
1307         }
1308         catch(exception& e) {
1309                 m->errorOut(e, "TrimSeqsCommand", "stripReverse");
1310                 exit(1);
1311         }
1312 }
1313
1314 //***************************************************************************************************************
1315
1316 bool TrimSeqsCommand::keepFirstTrim(Sequence& sequence, QualityScores& qscores){
1317         try {
1318                 bool success = 1;
1319                 if(qscores.getName() != ""){
1320                         qscores.trimQScores(-1, keepFirst);
1321                 }
1322                 sequence.trim(keepFirst);
1323                 return success;
1324         }
1325         catch(exception& e) {
1326                 m->errorOut(e, "keepFirstTrim", "countDiffs");
1327                 exit(1);
1328         }
1329         
1330 }       
1331
1332 //***************************************************************************************************************
1333
1334 bool TrimSeqsCommand::removeLastTrim(Sequence& sequence, QualityScores& qscores){
1335         try {
1336                 bool success = 0;
1337                 
1338                 int length = sequence.getNumBases() - removeLast;
1339                 
1340                 if(length > 0){
1341                         if(qscores.getName() != ""){
1342                                 qscores.trimQScores(-1, length);
1343                         }
1344                         sequence.trim(length);
1345                         success = 1;
1346                 }
1347                 else{
1348                         success = 0;
1349                 }
1350
1351                 return success;
1352         }
1353         catch(exception& e) {
1354                 m->errorOut(e, "removeLastTrim", "countDiffs");
1355                 exit(1);
1356         }
1357         
1358 }       
1359
1360 //***************************************************************************************************************
1361
1362 bool TrimSeqsCommand::cullLength(Sequence& seq){
1363         try {
1364         
1365                 int length = seq.getNumBases();
1366                 bool success = 0;       //guilty until proven innocent
1367                 
1368                 if(length >= minLength && maxLength == 0)                       {       success = 1;    }
1369                 else if(length >= minLength && length <= maxLength)     {       success = 1;    }
1370                 else                                                                                            {       success = 0;    }
1371                 
1372                 return success;
1373         
1374         }
1375         catch(exception& e) {
1376                 m->errorOut(e, "TrimSeqsCommand", "cullLength");
1377                 exit(1);
1378         }
1379         
1380 }
1381
1382 //***************************************************************************************************************
1383
1384 bool TrimSeqsCommand::cullHomoP(Sequence& seq){
1385         try {
1386                 int longHomoP = seq.getLongHomoPolymer();
1387                 bool success = 0;       //guilty until proven innocent
1388                 
1389                 if(longHomoP <= maxHomoP){      success = 1;    }
1390                 else                                    {       success = 0;    }
1391                 
1392                 return success;
1393         }
1394         catch(exception& e) {
1395                 m->errorOut(e, "TrimSeqsCommand", "cullHomoP");
1396                 exit(1);
1397         }
1398         
1399 }
1400
1401 //***************************************************************************************************************
1402
1403 bool TrimSeqsCommand::cullAmbigs(Sequence& seq){
1404         try {
1405                 int numNs = seq.getAmbigBases();
1406                 bool success = 0;       //guilty until proven innocent
1407                 
1408                 if(numNs <= maxAmbig)   {       success = 1;    }
1409                 else                                    {       success = 0;    }
1410                 
1411                 return success;
1412         }
1413         catch(exception& e) {
1414                 m->errorOut(e, "TrimSeqsCommand", "cullAmbigs");
1415                 exit(1);
1416         }
1417         
1418 }
1419
1420 //***************************************************************************************************************
1421
1422 bool TrimSeqsCommand::compareDNASeq(string oligo, string seq){
1423         try {
1424                 bool success = 1;
1425                 int length = oligo.length();
1426                 
1427                 for(int i=0;i<length;i++){
1428                         
1429                         if(oligo[i] != seq[i]){
1430                                 if(oligo[i] == 'A' || oligo[i] == 'T' || oligo[i] == 'G' || oligo[i] == 'C')    {       success = 0;    }
1431                                 else if((oligo[i] == 'N' || oligo[i] == 'I') && (seq[i] == 'N'))                                {       success = 0;    }
1432                                 else if(oligo[i] == 'R' && (seq[i] != 'A' && seq[i] != 'G'))                                    {       success = 0;    }
1433                                 else if(oligo[i] == 'Y' && (seq[i] != 'C' && seq[i] != 'T'))                                    {       success = 0;    }
1434                                 else if(oligo[i] == 'M' && (seq[i] != 'C' && seq[i] != 'A'))                                    {       success = 0;    }
1435                                 else if(oligo[i] == 'K' && (seq[i] != 'T' && seq[i] != 'G'))                                    {       success = 0;    }
1436                                 else if(oligo[i] == 'W' && (seq[i] != 'T' && seq[i] != 'A'))                                    {       success = 0;    }
1437                                 else if(oligo[i] == 'S' && (seq[i] != 'C' && seq[i] != 'G'))                                    {       success = 0;    }
1438                                 else if(oligo[i] == 'B' && (seq[i] != 'C' && seq[i] != 'T' && seq[i] != 'G'))   {       success = 0;    }
1439                                 else if(oligo[i] == 'D' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'G'))   {       success = 0;    }
1440                                 else if(oligo[i] == 'H' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'C'))   {       success = 0;    }
1441                                 else if(oligo[i] == 'V' && (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G'))   {       success = 0;    }                       
1442                                 
1443                                 if(success == 0)        {       break;   }
1444                         }
1445                         else{
1446                                 success = 1;
1447                         }
1448                 }
1449                 
1450                 return success;
1451         }
1452         catch(exception& e) {
1453                 m->errorOut(e, "TrimSeqsCommand", "compareDNASeq");
1454                 exit(1);
1455         }
1456
1457 }
1458
1459 //***************************************************************************************************************
1460
1461 int TrimSeqsCommand::countDiffs(string oligo, string seq){
1462         try {
1463
1464                 int length = oligo.length();
1465                 int countDiffs = 0;
1466                 
1467                 for(int i=0;i<length;i++){
1468                                                                 
1469                         if(oligo[i] != seq[i]){
1470                                 if(oligo[i] == 'A' || oligo[i] == 'T' || oligo[i] == 'G' || oligo[i] == 'C' || oligo[i] == '-' || oligo[i] == '.')      {       countDiffs++;   }
1471                                 else if((oligo[i] == 'N' || oligo[i] == 'I') && (seq[i] == 'N'))                                {       countDiffs++;   }
1472                                 else if(oligo[i] == 'R' && (seq[i] != 'A' && seq[i] != 'G'))                                    {       countDiffs++;   }
1473                                 else if(oligo[i] == 'Y' && (seq[i] != 'C' && seq[i] != 'T'))                                    {       countDiffs++;   }
1474                                 else if(oligo[i] == 'M' && (seq[i] != 'C' && seq[i] != 'A'))                                    {       countDiffs++;   }
1475                                 else if(oligo[i] == 'K' && (seq[i] != 'T' && seq[i] != 'G'))                                    {       countDiffs++;   }
1476                                 else if(oligo[i] == 'W' && (seq[i] != 'T' && seq[i] != 'A'))                                    {       countDiffs++;   }
1477                                 else if(oligo[i] == 'S' && (seq[i] != 'C' && seq[i] != 'G'))                                    {       countDiffs++;   }
1478                                 else if(oligo[i] == 'B' && (seq[i] != 'C' && seq[i] != 'T' && seq[i] != 'G'))   {       countDiffs++;   }
1479                                 else if(oligo[i] == 'D' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'G'))   {       countDiffs++;   }
1480                                 else if(oligo[i] == 'H' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'C'))   {       countDiffs++;   }
1481                                 else if(oligo[i] == 'V' && (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G'))   {       countDiffs++;   }       
1482                         }
1483                         
1484                 }
1485                 
1486                 return countDiffs;
1487         }
1488         catch(exception& e) {
1489                 m->errorOut(e, "TrimSeqsCommand", "countDiffs");
1490                 exit(1);
1491         }
1492
1493 }
1494
1495 //***************************************************************************************************************