]> git.donarmstrong.com Git - mothur.git/blob - trimseqscommand.cpp
fixed trim.seqs weird characters thing
[mothur.git] / trimseqscommand.cpp
1 /*
2  *  trimseqscommand.cpp
3  *  Mothur
4  *
5  *  Created by Pat Schloss on 6/6/09.
6  *  Copyright 2009 Patrick D. Schloss. All rights reserved.
7  *
8  */
9
10 #include "trimseqscommand.h"
11 #include "needlemanoverlap.hpp"
12
13 //**********************************************************************************************************************
14 vector<string> TrimSeqsCommand::getValidParameters(){   
15         try {
16                 string Array[] =  {"fasta", "flip", "oligos", "maxambig", "maxhomop", "group","minlength", "maxlength", "qfile", 
17                                                                         "qthreshold", "qwindowaverage", "qstepsize", "qwindowsize", "qaverage", "rollaverage",
18                                                                         "keepfirst", "removelast",
19                                                                         "allfiles", "qtrim","tdiffs", "pdiffs", "bdiffs", "processors", "outputdir","inputdir"};
20                 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
21                 return myArray;
22         }
23         catch(exception& e) {
24                 m->errorOut(e, "TrimSeqsCommand", "getValidParameters");
25                 exit(1);
26         }
27 }
28 //**********************************************************************************************************************
29 TrimSeqsCommand::TrimSeqsCommand(){     
30         try {
31                 abort = true;
32                 //initialize outputTypes
33                 vector<string> tempOutNames;
34                 outputTypes["fasta"] = tempOutNames;
35                 outputTypes["qual"] = tempOutNames;
36                 outputTypes["group"] = tempOutNames;
37         }
38         catch(exception& e) {
39                 m->errorOut(e, "TrimSeqsCommand", "TrimSeqsCommand");
40                 exit(1);
41         }
42 }
43 //**********************************************************************************************************************
44 vector<string> TrimSeqsCommand::getRequiredParameters(){        
45         try {
46                 string Array[] =  {"fasta"};
47                 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
48                 return myArray;
49         }
50         catch(exception& e) {
51                 m->errorOut(e, "TrimSeqsCommand", "getRequiredParameters");
52                 exit(1);
53         }
54 }
55 //**********************************************************************************************************************
56 vector<string> TrimSeqsCommand::getRequiredFiles(){     
57         try {
58                 vector<string> myArray;
59                 return myArray;
60         }
61         catch(exception& e) {
62                 m->errorOut(e, "TrimSeqsCommand", "getRequiredFiles");
63                 exit(1);
64         }
65 }
66 //***************************************************************************************************************
67
68 TrimSeqsCommand::TrimSeqsCommand(string option)  {
69         try {
70                 
71                 abort = false;
72                 comboStarts = 0;
73                 
74                 //allow user to run help
75                 if(option == "help") { help(); abort = true; }
76                 
77                 else {
78                         //valid paramters for this command
79                         string AlignArray[] =  {        "fasta", "flip", "oligos", "maxambig", "maxhomop", "group","minlength", "maxlength", "qfile", 
80                                                                 "qthreshold", "qwindowaverage", "qstepsize", "qwindowsize", "qaverage", "rollaverage",
81                                                                 "keepfirst", "removelast",
82                                                                 "allfiles", "qtrim","tdiffs", "pdiffs", "bdiffs", "processors", "outputdir","inputdir"};
83                         
84                         vector<string> myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string)));
85                         
86                         OptionParser parser(option);
87                         map<string,string> parameters = parser.getParameters();
88                         
89                         ValidParameters validParameter;
90                         map<string,string>::iterator it;
91                         
92                         //check to make sure all parameters are valid for command
93                         for (it = parameters.begin(); it != parameters.end(); it++) { 
94                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
95                         }
96                         
97                         //initialize outputTypes
98                         vector<string> tempOutNames;
99                         outputTypes["fasta"] = tempOutNames;
100                         outputTypes["qual"] = tempOutNames;
101                         outputTypes["group"] = tempOutNames;
102                         
103                         //if the user changes the input directory command factory will send this info to us in the output parameter 
104                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
105                         if (inputDir == "not found"){   inputDir = "";          }
106                         else {
107                                 string path;
108                                 it = parameters.find("fasta");
109                                 //user has given a template file
110                                 if(it != parameters.end()){ 
111                                         path = m->hasPath(it->second);
112                                         //if the user has not given a path then, add inputdir. else leave path alone.
113                                         if (path == "") {       parameters["fasta"] = inputDir + it->second;            }
114                                 }
115                                 
116                                 it = parameters.find("oligos");
117                                 //user has given a template file
118                                 if(it != parameters.end()){ 
119                                         path = m->hasPath(it->second);
120                                         //if the user has not given a path then, add inputdir. else leave path alone.
121                                         if (path == "") {       parameters["oligos"] = inputDir + it->second;           }
122                                 }
123                                 
124                                 it = parameters.find("qfile");
125                                 //user has given a template file
126                                 if(it != parameters.end()){ 
127                                         path = m->hasPath(it->second);
128                                         //if the user has not given a path then, add inputdir. else leave path alone.
129                                         if (path == "") {       parameters["qfile"] = inputDir + it->second;            }
130                                 }
131                                 
132                                 it = parameters.find("group");
133                                 //user has given a template file
134                                 if(it != parameters.end()){ 
135                                         path = m->hasPath(it->second);
136                                         //if the user has not given a path then, add inputdir. else leave path alone.
137                                         if (path == "") {       parameters["group"] = inputDir + it->second;            }
138                                 }
139                         }
140
141                         
142                         //check for required parameters
143                         fastaFile = validParameter.validFile(parameters, "fasta", true);
144                         if (fastaFile == "not found") { m->mothurOut("fasta is a required parameter for the trim.seqs command."); m->mothurOutEndLine(); abort = true; }
145                         else if (fastaFile == "not open") { abort = true; }     
146                         
147                         //if the user changes the output directory command factory will send this info to us in the output parameter 
148                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  
149                                 outputDir = ""; 
150                                 outputDir += m->hasPath(fastaFile); //if user entered a file with a path then preserve it       
151                         }
152                 
153                         
154                         //check for optional parameter and set defaults
155                         // ...at some point should added some additional type checking...
156                         string temp;
157                         temp = validParameter.validFile(parameters, "flip", false);
158                         if (temp == "not found"){       flip = 0;       }
159                         else if(m->isTrue(temp))        {       flip = 1;       }
160                 
161                         temp = validParameter.validFile(parameters, "oligos", true);
162                         if (temp == "not found"){       oligoFile = "";         }
163                         else if(temp == "not open"){    abort = true;   } 
164                         else                                    {       oligoFile = temp;               }
165                         
166                         temp = validParameter.validFile(parameters, "group", true);
167                         if (temp == "not found"){       groupfile = "";         }
168                         else if(temp == "not open"){    abort = true;   } 
169                         else                                    {       groupfile = temp;               }
170                         
171                         temp = validParameter.validFile(parameters, "maxambig", false);         if (temp == "not found") { temp = "-1"; }
172                         convert(temp, maxAmbig);  
173
174                         temp = validParameter.validFile(parameters, "maxhomop", false);         if (temp == "not found") { temp = "0"; }
175                         convert(temp, maxHomoP);  
176
177                         temp = validParameter.validFile(parameters, "minlength", false);        if (temp == "not found") { temp = "0"; }
178                         convert(temp, minLength); 
179                         
180                         temp = validParameter.validFile(parameters, "maxlength", false);        if (temp == "not found") { temp = "0"; }
181                         convert(temp, maxLength);
182                         
183                         temp = validParameter.validFile(parameters, "bdiffs", false);           if (temp == "not found") { temp = "0"; }
184                         convert(temp, bdiffs);
185                         
186                         temp = validParameter.validFile(parameters, "pdiffs", false);           if (temp == "not found") { temp = "0"; }
187                         convert(temp, pdiffs);
188                         
189                         temp = validParameter.validFile(parameters, "tdiffs", false);           if (temp == "not found") { int tempTotal = pdiffs + bdiffs;  temp = toString(tempTotal); }
190                         convert(temp, tdiffs);
191                         
192                         if(tdiffs == 0){        tdiffs = bdiffs + pdiffs;       }
193                         
194                         temp = validParameter.validFile(parameters, "qfile", true);     
195                         if (temp == "not found")        {       qFileName = "";         }
196                         else if(temp == "not open")     {       abort = true;           }
197                         else                                            {       qFileName = temp;       }
198                         
199                         temp = validParameter.validFile(parameters, "qthreshold", false);       if (temp == "not found") { temp = "0"; }
200                         convert(temp, qThreshold);
201                         
202                         temp = validParameter.validFile(parameters, "qtrim", false);            if (temp == "not found") { temp = "F"; }
203                         qtrim = m->isTrue(temp);
204
205                         temp = validParameter.validFile(parameters, "rollaverage", false);      if (temp == "not found") { temp = "0"; }
206                         convert(temp, qRollAverage);
207
208                         temp = validParameter.validFile(parameters, "qwindowaverage", false);if (temp == "not found") { temp = "0"; }
209                         convert(temp, qWindowAverage);
210
211                         temp = validParameter.validFile(parameters, "qwindowsize", false);      if (temp == "not found") { temp = "50"; }
212                         convert(temp, qWindowSize);
213
214                         temp = validParameter.validFile(parameters, "qstepsize", false);        if (temp == "not found") { temp = "1"; }
215                         convert(temp, qWindowStep);
216
217                         temp = validParameter.validFile(parameters, "qaverage", false);         if (temp == "not found") { temp = "0"; }
218                         convert(temp, qAverage);
219
220                         temp = validParameter.validFile(parameters, "keepfirst", false);        if (temp == "not found") { temp = "0"; }
221                         convert(temp, keepFirst);
222
223                         temp = validParameter.validFile(parameters, "removelast", false);       if (temp == "not found") { temp = "0"; }
224                         convert(temp, removeLast);
225                         
226                         temp = validParameter.validFile(parameters, "allfiles", false);         if (temp == "not found") { temp = "F"; }
227                         allFiles = m->isTrue(temp);
228                         
229                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found") { temp = "1"; }
230                         convert(temp, processors); 
231                         
232                         if ((oligoFile != "") && (groupfile != "")) {
233                                 m->mothurOut("You given both a oligos file and a groupfile, only one is allowed."); m->mothurOutEndLine(); abort = true;
234                         }
235                                                                                                 
236                         
237                         if(allFiles && (oligoFile == "") && (groupfile == "")){
238                                 m->mothurOut("You selected allfiles, but didn't enter an oligos or group file.  Ignoring the allfiles request."); m->mothurOutEndLine();
239                         }
240                         if((qAverage != 0 && qThreshold != 0) && qFileName == ""){
241                                 m->mothurOut("You didn't provide a quality file name, quality criteria will be ignored."); m->mothurOutEndLine();
242                                 qAverage=0;
243                                 qThreshold=0;
244                         }
245                         if(!flip && oligoFile=="" && !maxLength && !minLength && (maxAmbig==-1) && !maxHomoP && qFileName == ""){               
246                                 m->mothurOut("You didn't set any options... quiting command."); m->mothurOutEndLine();
247                                 abort = true;
248                         }
249                 }
250
251         }
252         catch(exception& e) {
253                 m->errorOut(e, "TrimSeqsCommand", "TrimSeqsCommand");
254                 exit(1);
255         }
256 }
257 //**********************************************************************************************************************
258
259 void TrimSeqsCommand::help(){
260         try {
261                 m->mothurOut("The trim.seqs command reads a fastaFile and creates .....\n");
262                 m->mothurOut("The trim.seqs command parameters are fasta, flip, oligos, group, maxambig, maxhomop, minlength, maxlength, qfile, qthreshold, qaverage, diffs, qtrim and allfiles.\n");
263                 m->mothurOut("The fasta parameter is required.\n");
264                 m->mothurOut("The group parameter allows you to enter a group file for your fasta file.\n");
265                 m->mothurOut("The flip parameter will output the reverse compliment of your trimmed sequence. The default is false.\n");
266                 m->mothurOut("The oligos parameter .... The default is "".\n");
267                 m->mothurOut("The maxambig parameter .... The default is -1.\n");
268                 m->mothurOut("The maxhomop parameter .... The default is 0.\n");
269                 m->mothurOut("The minlength parameter .... The default is 0.\n");
270                 m->mothurOut("The maxlength parameter .... The default is 0.\n");
271                 m->mothurOut("The tdiffs parameter is used to specify the total number of differences allowed in the sequence. The default is pdiffs + bdiffs.\n");
272                 m->mothurOut("The bdiffs parameter is used to specify the number of differences allowed in the barcode. The default is 0.\n");
273                 m->mothurOut("The pdiffs parameter is used to specify the number of differences allowed in the primer. The default is 0.\n");
274                 m->mothurOut("The qfile parameter .....\n");
275                 m->mothurOut("The qthreshold parameter .... The default is 0.\n");
276                 m->mothurOut("The qaverage parameter .... The default is 0.\n");
277                 m->mothurOut("The allfiles parameter .... The default is F.\n");
278                 m->mothurOut("The qtrim parameter .... The default is F.\n");
279                 m->mothurOut("The trim.seqs command should be in the following format: \n");
280                 m->mothurOut("trim.seqs(fasta=yourFastaFile, flip=yourFlip, oligos=yourOligos, maxambig=yourMaxambig,  \n");
281                 m->mothurOut("maxhomop=yourMaxhomop, minlength=youMinlength, maxlength=yourMaxlength)  \n");    
282                 m->mothurOut("Example trim.seqs(fasta=abrecovery.fasta, flip=..., oligos=..., maxambig=..., maxhomop=..., minlength=..., maxlength=...).\n");
283                 m->mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n");
284                 m->mothurOut("For more details please check out the wiki http://www.mothur.org/wiki/Trim.seqs .\n\n");
285
286         }
287         catch(exception& e) {
288                 m->errorOut(e, "TrimSeqsCommand", "help");
289                 exit(1);
290         }
291 }
292
293
294 //***************************************************************************************************************
295
296 TrimSeqsCommand::~TrimSeqsCommand(){    /*      do nothing      */      }
297
298 //***************************************************************************************************************
299
300 int TrimSeqsCommand::execute(){
301         try{
302         
303                 if (abort == true) { return 0; }
304                 
305                 numFPrimers = 0;  //this needs to be initialized
306                 numRPrimers = 0;
307                 vector<string> fastaFileNames;
308                 vector<string> qualFileNames;
309                 
310                 string trimSeqFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "trim.fasta";
311                 outputNames.push_back(trimSeqFile); outputTypes["fasta"].push_back(trimSeqFile);
312                 string scrapSeqFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "scrap.fasta";
313                 outputNames.push_back(scrapSeqFile); outputTypes["fasta"].push_back(scrapSeqFile);
314                 string trimQualFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "trim.qual";
315                 string scrapQualFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "scrap.qual";
316                 if (qFileName != "") {  outputNames.push_back(trimQualFile); outputNames.push_back(scrapQualFile);  outputTypes["qual"].push_back(trimQualFile); outputTypes["qual"].push_back(scrapQualFile); }
317                 string groupFile = "";
318                 if (groupfile == "") { groupFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "groups"; }
319                 else{
320                         groupFile = outputDir + m->getRootName(m->getSimpleName(groupfile)) + "trim.groups";
321                         outputNames.push_back(groupFile); outputTypes["group"].push_back(groupFile);
322                         groupMap = new GroupMap(groupfile);
323                         groupMap->readMap();
324                         
325                         if(allFiles){
326                                 for (int i = 0; i < groupMap->namesOfGroups.size(); i++) {
327                                         groupToIndex[groupMap->namesOfGroups[i]] = i;
328                                         groupVector.push_back(groupMap->namesOfGroups[i]);
329                                         fastaFileNames.push_back((outputDir + m->getRootName(m->getSimpleName(fastaFile)) +  groupMap->namesOfGroups[i] + ".fasta"));
330                                         
331                                         //we append later, so we want to clear file
332                                         ofstream outRemove;
333                                         m->openOutputFile(fastaFileNames[i], outRemove);
334                                         outRemove.close();
335                                         if(qFileName != ""){
336                                                 qualFileNames.push_back((outputDir + m->getRootName(m->getSimpleName(qFileName)) +  groupMap->namesOfGroups[i] + ".qual"));
337                                                 ofstream outRemove2;
338                                                 m->openOutputFile(qualFileNames[i], outRemove2);
339                                                 outRemove2.close();
340                                         }
341                                 }
342                         }
343                         comboStarts = fastaFileNames.size()-1;
344                 }
345                 
346                 if(oligoFile != ""){
347                         outputNames.push_back(groupFile); outputTypes["group"].push_back(groupFile);
348                         getOligos(fastaFileNames, qualFileNames);
349                 }
350
351                 vector<unsigned long int> fastaFilePos;
352                 vector<unsigned long int> qFilePos;
353                 
354                 setLines(fastaFile, qFileName, fastaFilePos, qFilePos);
355                 
356                 for (int i = 0; i < (fastaFilePos.size()-1); i++) {
357                         lines.push_back(new linePair(fastaFilePos[i], fastaFilePos[(i+1)]));
358                         if (qFileName != "") {  qLines.push_back(new linePair(qFilePos[i], qFilePos[(i+1)]));  }
359                 }       
360                 if(qFileName == "")     {       qLines = lines; } //files with duds
361                 
362                 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
363                                 if(processors == 1){
364                                         driverCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, groupFile, fastaFileNames, qualFileNames, lines[0], qLines[0]);
365                                 }else{
366                                         createProcessesCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, groupFile, fastaFileNames, qualFileNames); 
367                                 }       
368                 #else
369                                 driverCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, groupFile, fastaFileNames, qualFileNames, lines[0], qLines[0]);
370                 #endif
371                 
372                 if (m->control_pressed) {  return 0; }                  
373                         
374                 set<string> blanks;
375                 for(int i=0;i<fastaFileNames.size();i++){
376                         if (m->isBlank(fastaFileNames[i])) {   blanks.insert(fastaFileNames[i]);        }
377                         else if (filesToRemove.count(fastaFileNames[i]) > 0) {  remove(fastaFileNames[i].c_str()); }
378                         else {
379                                 ifstream inFASTA;
380                                 string seqName;
381                                 m->openInputFile(fastaFileNames[i], inFASTA);
382                                 ofstream outGroups;
383                                 string outGroupFilename = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[i])) + "groups";
384                                 
385                                 //if the fastafile is on the blanks list then the groups file should be as well
386                                 if (blanks.count(fastaFileNames[i]) != 0) { blanks.insert(outGroupFilename); }
387                                 
388                                 m->openOutputFile(outGroupFilename, outGroups);
389                                 outputNames.push_back(outGroupFilename); outputTypes["group"].push_back(outGroupFilename);  
390                                 
391                                 string thisGroup = "";
392                                 if (i > comboStarts) {
393                                         map<string, int>::iterator itCombo;
394                                         for(itCombo=combos.begin();itCombo!=combos.end(); itCombo++){
395                                                 if(itCombo->second == i){       thisGroup = itCombo->first;     combos.erase(itCombo);  break;  }
396                                         }
397                                 }else{ thisGroup = groupVector[i]; }
398                                         
399                                 while(!inFASTA.eof()){
400                                         if(inFASTA.get() == '>'){
401                                                 inFASTA >> seqName;
402                                                 outGroups << seqName << '\t' << thisGroup << endl;
403                                         }
404                                         while (!inFASTA.eof())  {       char c = inFASTA.get(); if (c == 10 || c == 13){        break;  }       }
405                                 }
406                                 outGroups.close();
407                                 inFASTA.close();
408                         }
409                 }
410                 
411                 for (set<string>::iterator itBlanks = blanks.begin(); itBlanks != blanks.end(); itBlanks++) {  remove((*(itBlanks)).c_str()); }
412                 
413                 blanks.clear();
414                 if(qFileName != ""){
415                         for(int i=0;i<qualFileNames.size();i++){
416                                 if (m->isBlank(qualFileNames[i])) {  blanks.insert(qualFileNames[i]);   }
417                                 else if (filesToRemove.count(qualFileNames[i]) > 0) {  remove(qualFileNames[i].c_str()); }
418                         }
419                 }
420                 
421                 for (set<string>::iterator itBlanks = blanks.begin(); itBlanks != blanks.end(); itBlanks++) {  remove((*(itBlanks)).c_str()); }
422                 
423                 if (m->control_pressed) { 
424                         for (int i = 0; i < outputNames.size(); i++) {  remove(outputNames[i].c_str()); }
425                         return 0;
426                 }
427
428                 m->mothurOutEndLine();
429                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
430                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
431                 m->mothurOutEndLine();
432                 
433                 return 0;       
434                         
435         }
436         catch(exception& e) {
437                 m->errorOut(e, "TrimSeqsCommand", "execute");
438                 exit(1);
439         }
440 }
441                 
442 /**************************************************************************************/
443
444 int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string trimFile, string scrapFile, string trimQFile, string scrapQFile, string groupFile, vector<string> fastaNames, vector<string> qualNames, linePair* line, linePair* qline) {      
445                 
446         try {
447                 
448                 ofstream outFASTA;
449                 int able = m->openOutputFile(trimFile, outFASTA);
450                 
451                 ofstream scrapFASTA;
452                 m->openOutputFile(scrapFile, scrapFASTA);
453                 
454                 ofstream outQual;
455                 ofstream scrapQual;
456                 if(qFileName != ""){
457                         m->openOutputFile(trimQFile, outQual);
458                         m->openOutputFile(scrapQFile, scrapQual);
459                 }
460                 
461                 ofstream outGroups;
462                 
463                 if (oligoFile != "") {          
464                         m->openOutputFile(groupFile, outGroups);   
465                 }
466                 
467                 ifstream inFASTA;
468                 m->openInputFile(filename, inFASTA);
469                 inFASTA.seekg(line->start);
470                 
471                 ifstream qFile;
472                 if(qFileName != "")     {       m->openInputFile(qFileName, qFile);     qFile.seekg(qline->start);  }
473                 
474                 
475                 for (int i = 0; i < fastaNames.size(); i++) { //clears old file
476                         ofstream temp;
477                         m->openOutputFile(fastaNames[i], temp);
478                         temp.close();
479                 }
480                 for (int i = 0; i < qualNames.size(); i++) { //clears old file
481                         ofstream temp;
482                         m->openOutputFile(qualNames[i], temp);
483                         temp.close();
484                 }
485                 
486                         
487                 bool done = false;
488                 int count = 0;
489         
490                 while (!done) {
491                                 
492                         if (m->control_pressed) { 
493                                 inFASTA.close(); outFASTA.close(); scrapFASTA.close();
494                                 if (oligoFile != "") {   outGroups.close();   }
495
496                                 if(qFileName != ""){
497                                         qFile.close();
498                                 }
499                                 for (int i = 0; i < outputNames.size(); i++) {  remove(outputNames[i].c_str()); }
500
501                                 return 0;
502                         }
503                         
504                         int success = 1;
505                         
506
507                         Sequence currSeq(inFASTA); m->gobble(inFASTA);
508
509                         QualityScores currQual;
510                         if(qFileName != ""){
511                                 currQual = QualityScores(qFile, currSeq.getNumBases());  m->gobble(qFile);
512                         }
513                         
514                         string origSeq = currSeq.getUnaligned();
515                         if (origSeq != "") {
516                                 int groupBar, groupPrime;
517                                 string trashCode = "";
518                                 int currentSeqsDiffs = 0;
519
520                                 if(barcodes.size() != 0){
521                                         success = stripBarcode(currSeq, currQual, groupBar);
522                                         if(success > bdiffs)            {       trashCode += 'b';       }
523                                         else{ currentSeqsDiffs += success;  }
524                                 }
525                                 
526                                 if(numFPrimers != 0){
527                                         success = stripForward(currSeq, currQual, groupPrime);
528                                         if(success > pdiffs)            {       trashCode += 'f';       }
529                                         else{ currentSeqsDiffs += success;  }
530                                 }
531                                 
532                                 if (currentSeqsDiffs > tdiffs)  {       trashCode += 't';   }
533                                 
534                                 if(numRPrimers != 0){
535                                         success = stripReverse(currSeq, currQual);
536                                         if(!success)                            {       trashCode += 'r';       }
537                                 }
538
539                                 if(keepFirst != 0){
540                                         success = keepFirstTrim(currSeq, currQual);
541                                 }
542                                 
543                                 if(removeLast != 0){
544                                         success = removeLastTrim(currSeq, currQual);
545                                         if(!success)                            {       trashCode += 'l';       }
546                                 }
547
548                                 
549                                 if(qFileName != ""){
550                                         
551                                         if(qThreshold != 0)                     {       success = currQual.stripQualThreshold(currSeq, qThreshold);                     }
552                                         else if(qAverage != 0)          {       success = currQual.cullQualAverage(currSeq, qAverage);                          }
553                                         else if(qRollAverage != 0)      {       success = currQual.stripQualRollingAverage(currSeq, qRollAverage);      }
554                                         else if(qWindowAverage != 0){   success = currQual.stripQualWindowAverage(currSeq, qWindowStep, qWindowSize, qWindowAverage);   }
555                                         else                                            {       success = 1;                            }
556
557 //                                      if (qtrim == 1 && (origSeq.length() != currSeq.getUnaligned().length())) { 
558 //                                              success = 0; //if you don't want to trim and the sequence does not meet quality requirements, move to scrap
559 //                                      }
560                                         
561                                         if(!success)                            {       trashCode += 'q';       }
562                                 }                               
563                 
564                                 if(minLength > 0 || maxLength > 0){
565                                         success = cullLength(currSeq);
566                                         if(!success)                            {       trashCode += 'l';       }
567                                 }
568                                 if(maxHomoP > 0){
569                                         success = cullHomoP(currSeq);
570                                         if(!success)                            {       trashCode += 'h';       }
571                                 }
572                                 if(maxAmbig != -1){
573                                         success = cullAmbigs(currSeq);
574                                         if(!success)                            {       trashCode += 'n';       }
575                                 }
576                                 
577                                 if(flip){               // should go last                       
578                                         currSeq.reverseComplement();
579                                         currQual.flipQScores(); 
580                                 }
581                                 
582                                 if(trashCode.length() == 0){
583                                         currSeq.setAligned(currSeq.getUnaligned());
584                                         currSeq.printSequence(outFASTA);
585                                         currQual.printQScores(outQual);
586                                         
587                                         if(barcodes.size() != 0){
588                                                 string thisGroup = groupVector[groupBar];
589                                                 int indexToFastaFile = groupBar;
590                                                 if (primers.size() != 0){
591                                                         //does this primer have a group
592                                                         if (groupVector[groupPrime] != "") {  
593                                                                 thisGroup += "." + groupVector[groupPrime]; 
594                                                                 indexToFastaFile = combos[thisGroup];
595                                                         }
596                                                 }
597                                                 outGroups << currSeq.getName() << '\t' << thisGroup << endl;
598                                                 if(allFiles){
599                                                         ofstream outTemp;
600                                                         m->openOutputFileAppend(fastaNames[indexToFastaFile], outTemp);
601                                                         //currSeq.printSequence(*fastaFileNames[indexToFastaFile]);
602                                                         currSeq.printSequence(outTemp);
603                                                         outTemp.close();
604                                                         
605                                                         if(qFileName != ""){
606                                                                 //currQual.printQScores(*qualFileNames[indexToFastaFile]);
607                                                                 ofstream outTemp2;
608                                                                 m->openOutputFileAppend(qualNames[indexToFastaFile], outTemp2);
609                                                                 currQual.printQScores(outTemp2);
610                                                                 outTemp2.close();                                                       
611                                                         }
612                                                 }
613                                         }
614                                         
615                                         if (groupfile != "") {
616                                                 string thisGroup = groupMap->getGroup(currSeq.getName());
617                                                 
618                                                 if (thisGroup != "not found") {
619                                                         outGroups << currSeq.getName() << '\t' << thisGroup << endl;
620                                                         if (allFiles) {
621                                                                 ofstream outTemp;
622                                                                 m->openOutputFileAppend(fastaNames[groupToIndex[thisGroup]], outTemp);
623                                                                 currSeq.printSequence(outTemp);
624                                                                 outTemp.close();
625                                                                 if(qFileName != ""){
626                                                                         ofstream outTemp2;
627                                                                         m->openOutputFileAppend(qualNames[groupToIndex[thisGroup]], outTemp2);
628                                                                         currQual.printQScores(outTemp2);
629                                                                         outTemp2.close();                                                       
630                                                                 }
631                                                         }
632                                                 }else{
633                                                         m->mothurOut(currSeq.getName() + " is not in your groupfile, adding to group XXX."); m->mothurOutEndLine();
634                                                         outGroups << currSeq.getName() << '\t' << "XXX" << endl;
635                                                         if (allFiles) {  
636                                                                 m->mothurOut("[ERROR]: " + currSeq.getName() + " will not be added to any .group.fasta or .group.qual file."); m->mothurOutEndLine();
637                                                         }
638                                                 }
639                                         }
640                                 }
641                                 else{
642                                         currSeq.setName(currSeq.getName() + '|' + trashCode);
643                                         currSeq.setUnaligned(origSeq);
644                                         currSeq.setAligned(origSeq);
645                                         currSeq.printSequence(scrapFASTA);
646                                         currQual.printQScores(scrapQual);
647                                 }
648                                 count++;
649                         }
650                         
651                         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
652                                 unsigned long int pos = inFASTA.tellg();
653                                 if ((pos == -1) || (pos >= line->end)) { break; }
654                         #else
655                                 if (inFASTA.eof()) { break; }
656                         #endif
657                                 
658                         //report progress
659                         if((count) % 1000 == 0){        m->mothurOut(toString(count)); m->mothurOutEndLine();           }
660                         
661                 }
662                 //report progress
663                 if((count) % 1000 != 0){        m->mothurOut(toString(count)); m->mothurOutEndLine();           }
664
665                 
666                 inFASTA.close();
667                 outFASTA.close();
668                 scrapFASTA.close();
669                 if (oligoFile != "") {   outGroups.close();   }
670                 if(qFileName != "")     {       qFile.close();  scrapQual.close(); outQual.close();     }
671                 
672                 return count;
673         }
674         catch(exception& e) {
675                 m->errorOut(e, "TrimSeqsCommand", "driverCreateTrim");
676                 exit(1);
677         }
678 }
679
680 /**************************************************************************************************/
681
682 int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName, string trimFile, string scrapFile, string trimQFile, string scrapQFile, string groupFile, vector<string> fastaNames, vector<string> qualNames) {
683         try {
684 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
685                 int process = 1;
686                 int exitCommand = 1;
687                 processIDS.clear();
688                 
689                 //loop through and create all the processes you want
690                 while (process != processors) {
691                         int pid = fork();
692                         
693                         if (pid > 0) {
694                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
695                                 process++;
696                         }else if (pid == 0){
697                                 for (int i = 0; i < fastaNames.size(); i++) {
698                                         fastaNames[i] = (fastaNames[i] + toString(getpid()) + ".temp");
699                                         //clear old file if it exists
700                                         ofstream temp;
701                                         m->openOutputFile(fastaNames[i], temp);
702                                         temp.close();
703                                         if(qFileName != ""){
704                                                 qualNames[i] = (qualNames[i] + toString(getpid()) + ".temp");
705                                                 //clear old file if it exists
706                                                 ofstream temp2;
707                                                 m->openOutputFile(qualNames[i], temp2);
708                                                 temp2.close();
709                                         }
710                                 }
711                                 
712                                 driverCreateTrim(filename, qFileName, (trimFile + toString(getpid()) + ".temp"), (scrapFile + toString(getpid()) + ".temp"), (trimQFile + toString(getpid()) + ".temp"), (scrapQFile + toString(getpid()) + ".temp"), (groupFile + toString(getpid()) + ".temp"), fastaNames, qualNames, lines[process], qLines[process]);
713                                 exit(0);
714                         }else { 
715                                 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine(); 
716                                 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
717                                 exit(0);
718                         }
719                 }
720                 
721                 //parent do my part
722                 for (int i = 0; i < fastaNames.size(); i++) {
723                         //clear old file if it exists
724                         ofstream temp;
725                         m->openOutputFile(fastaNames[i], temp);
726                         temp.close();
727                         if(qFileName != ""){
728                                 //clear old file if it exists
729                                 ofstream temp2;
730                                 m->openOutputFile(qualNames[i], temp2);
731                                 temp2.close();
732                         }
733                 }
734                 
735                 driverCreateTrim(filename, qFileName, trimFile, scrapFile, trimQFile, scrapQFile, groupFile, fastaNames, qualNames, lines[0], qLines[0]);
736                 
737                 
738                 //force parent to wait until all the processes are done
739                 for (int i=0;i<processIDS.size();i++) { 
740                         int temp = processIDS[i];
741                         wait(&temp);
742                 }
743                 
744                 //append files
745                 for(int i=0;i<processIDS.size();i++){
746                         
747                         m->mothurOut("Appending files from process " + toString(processIDS[i])); m->mothurOutEndLine();
748                         
749                         m->appendFiles((trimFile + toString(processIDS[i]) + ".temp"), trimFile);
750                         remove((trimFile + toString(processIDS[i]) + ".temp").c_str());
751                         m->appendFiles((scrapFile + toString(processIDS[i]) + ".temp"), scrapFile);
752                         remove((scrapFile + toString(processIDS[i]) + ".temp").c_str());
753                         
754                         m->mothurOut("Done with fasta files"); m->mothurOutEndLine();
755                         
756                         if(qFileName != ""){
757                                 m->appendFiles((trimQFile + toString(processIDS[i]) + ".temp"), trimQFile);
758                                 remove((trimQFile + toString(processIDS[i]) + ".temp").c_str());
759                                 m->appendFiles((scrapQFile + toString(processIDS[i]) + ".temp"), scrapQFile);
760                                 remove((scrapQFile + toString(processIDS[i]) + ".temp").c_str());
761                         
762                                 m->mothurOut("Done with quality files"); m->mothurOutEndLine();
763                         }
764                         
765                         m->appendFiles((groupFile + toString(processIDS[i]) + ".temp"), groupFile);
766                         remove((groupFile + toString(processIDS[i]) + ".temp").c_str());
767                         
768                         m->mothurOut("Done with group file"); m->mothurOutEndLine();
769                         
770                         for (int j = 0; j < fastaNames.size(); j++) {
771                                 m->appendFiles((fastaNames[j] + toString(processIDS[i]) + ".temp"), fastaNames[j]);
772                                 remove((fastaNames[j] + toString(processIDS[i]) + ".temp").c_str());
773                         }
774                         
775                         if(qFileName != ""){
776                                 for (int j = 0; j < qualNames.size(); j++) {
777                                         m->appendFiles((qualNames[j] + toString(processIDS[i]) + ".temp"), qualNames[j]);
778                                         remove((qualNames[j] + toString(processIDS[i]) + ".temp").c_str());
779                                 }
780                         }
781                         
782                         if (allFiles) { m->mothurOut("Done with allfiles"); m->mothurOutEndLine(); }
783                 }
784         
785                 return exitCommand;
786 #endif          
787         }
788         catch(exception& e) {
789                 m->errorOut(e, "TrimSeqsCommand", "createProcessesCreateTrim");
790                 exit(1);
791         }
792 }
793
794 /**************************************************************************************************/
795
796 int TrimSeqsCommand::setLines(string filename, string qfilename, vector<unsigned long int>& fastaFilePos, vector<unsigned long int>& qfileFilePos) {
797         try {
798                 
799                 //set file positions for fasta file
800                 fastaFilePos = m->divideFile(filename, processors);
801                 
802                 if (qfilename == "") { return processors; }
803                 
804                 //get name of first sequence in each chunk
805                 map<string, int> firstSeqNames;
806                 for (int i = 0; i < (fastaFilePos.size()-1); i++) {
807                         ifstream in;
808                         m->openInputFile(filename, in);
809                         in.seekg(fastaFilePos[i]);
810                 
811                         Sequence temp(in); 
812                         firstSeqNames[temp.getName()] = i;
813                 
814                         in.close();
815                 }
816                                 
817                 //seach for filePos of each first name in the qfile and save in qfileFilePos
818                 ifstream inQual;
819                 m->openInputFile(qfilename, inQual);
820                 
821                 string input;
822                 while(!inQual.eof()){   
823                         input = m->getline(inQual);
824
825                         if (input.length() != 0) {
826                                 if(input[0] == '>'){ //this is a sequence name line
827                                         istringstream nameStream(input);
828                                         
829                                         string sname = "";  nameStream >> sname;
830                                         sname = sname.substr(1);
831                                         
832                                         map<string, int>::iterator it = firstSeqNames.find(sname);
833                                         
834                                         if(it != firstSeqNames.end()) { //this is the start of a new chunk
835                                                 unsigned long int pos = inQual.tellg(); 
836                                                 qfileFilePos.push_back(pos - input.length() - 1);       
837                                                 firstSeqNames.erase(it);
838                                         }
839                                 }
840                         }
841                         
842                         if (firstSeqNames.size() == 0) { break; }
843                 }
844                 inQual.close();
845                 
846                 
847                 if (firstSeqNames.size() != 0) { 
848                         for (map<string, int>::iterator it = firstSeqNames.begin(); it != firstSeqNames.end(); it++) {
849                                 m->mothurOut(it->first + " is in your fasta file and not in your quality file, not using quality file."); m->mothurOutEndLine();
850                         }
851                         qFileName = "";
852                         return processors;
853                 }
854
855                 //get last file position of qfile
856                 FILE * pFile;
857                 unsigned long int size;
858                 
859                 //get num bytes in file
860                 pFile = fopen (qfilename.c_str(),"rb");
861                 if (pFile==NULL) perror ("Error opening file");
862                 else{
863                         fseek (pFile, 0, SEEK_END);
864                         size=ftell (pFile);
865                         fclose (pFile);
866                 }
867                 
868                 qfileFilePos.push_back(size);
869                 
870                 return processors;
871         }
872         catch(exception& e) {
873                 m->errorOut(e, "TrimSeqsCommand", "setLines");
874                 exit(1);
875         }
876 }
877 //***************************************************************************************************************
878
879 void TrimSeqsCommand::getOligos(vector<string>& outFASTAVec, vector<string>& outQualVec){
880         try {
881                 ifstream inOligos;
882                 m->openInputFile(oligoFile, inOligos);
883                 
884                 ofstream test;
885                 
886                 string type, oligo, group;
887                 int index=0;
888                 //int indexPrimer = 0;
889                 
890                 while(!inOligos.eof()){
891                         inOligos >> type; m->gobble(inOligos);
892                                         
893                         if(type[0] == '#'){
894                                 while (!inOligos.eof()) {       char c = inOligos.get(); if (c == 10 || c == 13){       break;  }       } // get rest of line if there's any crap there
895                         }
896                         else{
897                                 //make type case insensitive
898                                 for(int i=0;i<type.length();i++){       type[i] = toupper(type[i]);  }
899                                 
900                                 inOligos >> oligo;
901                                 
902                                 for(int i=0;i<oligo.length();i++){
903                                         oligo[i] = toupper(oligo[i]);
904                                         if(oligo[i] == 'U')     {       oligo[i] = 'T'; }
905                                 }
906                                 
907                                 if(type == "FORWARD"){
908                                         group = "";
909                                         
910                                         // get rest of line in case there is a primer name
911                                         while (!inOligos.eof()) {       
912                                                 char c = inOligos.get(); 
913                                                 if (c == 10 || c == 13){        break;  }
914                                                 else if (c == 32 || c == 9){;} //space or tab
915                                                 else {  group += c;  }
916                                         } 
917                                         
918                                         //check for repeat barcodes
919                                         map<string, int>::iterator itPrime = primers.find(oligo);
920                                         if (itPrime != primers.end()) { m->mothurOut("primer " + oligo + " is in your oligos file already."); m->mothurOutEndLine();  }
921                                         
922                                                 primers[oligo]=index; index++;
923                                                 groupVector.push_back(group);
924                                         
925                                                 if(allFiles){
926                                                         outFASTAVec.push_back((outputDir + m->getRootName(m->getSimpleName(fastaFile)) + group + ".fasta"));
927                                                         if(qFileName != ""){
928                                                                 outQualVec.push_back((outputDir + m->getRootName(m->getSimpleName(qFileName)) + group + ".qual"));
929                                                         }
930                                                         if (group == "") { //if there is not a group for this primer, then this file will not get written to, but we add it to keep the indexes correct
931                                                                 filesToRemove.insert((outputDir + m->getRootName(m->getSimpleName(fastaFile)) + group + ".fasta"));
932                                                                 if(qFileName != ""){
933                                                                         filesToRemove.insert((outputDir + m->getRootName(m->getSimpleName(qFileName)) + group + ".qual"));
934                                                                 }
935                                                         }else {
936                                                                 outputNames.push_back((outputDir + m->getRootName(m->getSimpleName(fastaFile)) + group + ".fasta"));
937                                                                 outputTypes["fasta"].push_back((outputDir + m->getRootName(m->getSimpleName(fastaFile)) + group + ".fasta"));
938                                                                 if(qFileName != ""){
939                                                                         outputNames.push_back((outputDir + m->getRootName(m->getSimpleName(qFileName)) + group + ".qual"));
940                                                                         outputTypes["qual"].push_back((outputDir + m->getRootName(m->getSimpleName(qFileName)) + group + ".qual"));
941                                                                 }                                                       
942                                                         }
943                                                 }
944                                         
945                                 }
946                                 else if(type == "REVERSE"){
947                                         Sequence oligoRC("reverse", oligo);
948                                         oligoRC.reverseComplement();
949                                         revPrimer.push_back(oligoRC.getUnaligned());
950                                 }
951                                 else if(type == "BARCODE"){
952                                         inOligos >> group;
953                                         
954                                         //check for repeat barcodes
955                                         map<string, int>::iterator itBar = barcodes.find(oligo);
956                                         if (itBar != barcodes.end()) { m->mothurOut("barcode " + oligo + " is in your oligos file already."); m->mothurOutEndLine();  }
957                                         
958                                                 barcodes[oligo]=index; index++;
959                                                 groupVector.push_back(group);
960                                                 
961                                                 if(allFiles){
962                                                         outputNames.push_back((outputDir + m->getRootName(m->getSimpleName(fastaFile)) + group + ".fasta"));
963                                                         outputNames.push_back((outputDir + m->getRootName(m->getSimpleName(fastaFile)) + group + ".fasta"));
964                                                         outFASTAVec.push_back((outputDir + m->getRootName(m->getSimpleName(fastaFile)) + group + ".fasta"));
965                                                         if(qFileName != ""){
966                                                                 outQualVec.push_back((outputDir + m->getRootName(m->getSimpleName(qFileName)) + group + ".qual"));
967                                                                 outputNames.push_back((outputDir + m->getRootName(m->getSimpleName(qFileName)) + group + ".qual"));
968                                                                 outputTypes["qual"].push_back((outputDir + m->getRootName(m->getSimpleName(qFileName)) + group + ".qual"));
969                                                         }                                                       
970                                                 }
971                                         
972                                 }else{  m->mothurOut(type + " is not recognized as a valid type. Choices are forward, reverse, and barcode. Ignoring " + oligo + "."); m->mothurOutEndLine();  }
973                         }
974                         m->gobble(inOligos);
975                 }
976                 
977                 inOligos.close();
978                 
979                 //add in potential combos
980                 if(allFiles){
981                         comboStarts = outFASTAVec.size()-1;
982                         for (map<string, int>::iterator itBar = barcodes.begin(); itBar != barcodes.end(); itBar++) {
983                                 for (map<string, int>::iterator itPrime = primers.begin(); itPrime != primers.end(); itPrime++) {
984                                         if (groupVector[itPrime->second] != "") { //there is a group for this primer
985                                                 outputNames.push_back((outputDir + m->getRootName(m->getSimpleName(qFileName)) + groupVector[itBar->second] + "." + groupVector[itPrime->second] + ".fasta"));
986                                                 outputTypes["fasta"].push_back((outputDir + m->getRootName(m->getSimpleName(qFileName)) + groupVector[itBar->second] + "." + groupVector[itPrime->second] + ".fasta"));
987                                                 outFASTAVec.push_back((outputDir + m->getRootName(m->getSimpleName(fastaFile)) + groupVector[itBar->second] + "." + groupVector[itPrime->second] + ".fasta"));
988                                                 combos[(groupVector[itBar->second] + "." + groupVector[itPrime->second])] = outFASTAVec.size()-1;
989                                                 
990                                                 if(qFileName != ""){
991                                                         outQualVec.push_back((outputDir + m->getRootName(m->getSimpleName(qFileName)) + groupVector[itBar->second] + "." + groupVector[itPrime->second] + ".qual"));
992                                                         outputNames.push_back((outputDir + m->getRootName(m->getSimpleName(qFileName)) + groupVector[itBar->second] + "." + groupVector[itPrime->second] + ".qual"));
993                                                         outputTypes["qual"].push_back((outputDir + m->getRootName(m->getSimpleName(qFileName)) + groupVector[itBar->second] + "." + groupVector[itPrime->second] + ".qual"));
994                                                 }
995                                         }
996                                 }
997                         }
998                 }
999                 
1000                 numFPrimers = primers.size();
1001                 numRPrimers = revPrimer.size();
1002                 
1003         }
1004         catch(exception& e) {
1005                 m->errorOut(e, "TrimSeqsCommand", "getOligos");
1006                 exit(1);
1007         }
1008 }
1009 //***************************************************************************************************************
1010
1011 int TrimSeqsCommand::stripBarcode(Sequence& seq, QualityScores& qual, int& group){
1012         try {
1013                 
1014                 string rawSequence = seq.getUnaligned();
1015                 int success = bdiffs + 1;       //guilty until proven innocent
1016                 
1017                 //can you find the barcode
1018                 for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
1019                         string oligo = it->first;
1020                         if(rawSequence.length() < oligo.length()){      //let's just assume that the barcodes are the same length
1021                                 success = bdiffs + 10;                                  //if the sequence is shorter than the barcode then bail out
1022                                 break;  
1023                         }
1024                         
1025                         if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
1026                                 group = it->second;
1027                                 seq.setUnaligned(rawSequence.substr(oligo.length()));
1028                                 
1029                                 if(qual.getName() != ""){
1030                                         qual.trimQScores(oligo.length(), -1);
1031                                 }
1032                                 
1033                                 success = 0;
1034                                 break;
1035                         }
1036                 }
1037                 
1038                 //if you found the barcode or if you don't want to allow for diffs
1039 //              cout << success;
1040                 if ((bdiffs == 0) || (success == 0)) { return success;  }
1041                 
1042                 else { //try aligning and see if you can find it
1043 //                      cout << endl;
1044
1045                         int maxLength = 0;
1046
1047                         Alignment* alignment;
1048                         if (barcodes.size() > 0) {
1049                                 map<string,int>::iterator it=barcodes.begin();
1050
1051                                 for(it;it!=barcodes.end();it++){
1052                                         if(it->first.length() > maxLength){
1053                                                 maxLength = it->first.length();
1054                                         }
1055                                 }
1056                                 alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+bdiffs+1));  
1057
1058                         }else{ alignment = NULL; } 
1059                         
1060                         //can you find the barcode
1061                         int minDiff = 1e6;
1062                         int minCount = 1;
1063                         int minGroup = -1;
1064                         int minPos = 0;
1065                         
1066                         for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
1067                                 string oligo = it->first;
1068 //                              int length = oligo.length();
1069                                 
1070                                 if(rawSequence.length() < maxLength){   //let's just assume that the barcodes are the same length
1071                                         success = bdiffs + 10;
1072                                         break;
1073                                 }
1074                                 
1075                                 //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
1076                                 alignment->align(oligo, rawSequence.substr(0,oligo.length()+bdiffs));
1077                                 oligo = alignment->getSeqAAln();
1078                                 string temp = alignment->getSeqBAln();
1079                 
1080                                 int alnLength = oligo.length();
1081                                 
1082                                 for(int i=oligo.length()-1;i>=0;i--){
1083                                         if(oligo[i] != '-'){    alnLength = i+1;        break;  }
1084                                 }
1085                                 oligo = oligo.substr(0,alnLength);
1086                                 temp = temp.substr(0,alnLength);
1087                                 
1088                                 int newStart=0;
1089                                 int numDiff = countDiffs(oligo, temp);
1090                                 
1091 //                              cout << oligo << '\t' << temp << '\t' << numDiff << endl;                               
1092                                 
1093                                 if(numDiff < minDiff){
1094                                         minDiff = numDiff;
1095                                         minCount = 1;
1096                                         minGroup = it->second;
1097                                         minPos = 0;
1098                                         for(int i=0;i<alnLength;i++){
1099                                                 if(temp[i] != '-'){
1100                                                         minPos++;
1101                                                 }
1102                                         }
1103                                 }
1104                                 else if(numDiff == minDiff){
1105                                         minCount++;
1106                                 }
1107
1108                         }
1109
1110                         if(minDiff > bdiffs)    {       success = minDiff;              }       //no good matches
1111                         else if(minCount > 1)   {       success = bdiffs + 100; }       //can't tell the difference between multiple barcodes
1112                         else{                                                                                                   //use the best match
1113                                 group = minGroup;
1114                                 seq.setUnaligned(rawSequence.substr(minPos));
1115                                 
1116                                 if(qual.getName() != ""){
1117                                         qual.trimQScores(minPos, -1);
1118                                 }
1119                                 success = minDiff;
1120                         }
1121                         
1122                         if (alignment != NULL) {  delete alignment;  }
1123                         
1124                 }
1125 //              cout << success << endl;
1126                 
1127                 return success;
1128                 
1129         }
1130         catch(exception& e) {
1131                 m->errorOut(e, "TrimSeqsCommand", "stripBarcode");
1132                 exit(1);
1133         }
1134
1135 }
1136
1137 //***************************************************************************************************************
1138
1139 int TrimSeqsCommand::stripForward(Sequence& seq, QualityScores& qual, int& group){
1140         try {
1141                 string rawSequence = seq.getUnaligned();
1142                 int success = pdiffs + 1;       //guilty until proven innocent
1143                 
1144                 //can you find the primer
1145                 for(map<string,int>::iterator it=primers.begin();it!=primers.end();it++){
1146                         string oligo = it->first;
1147                         if(rawSequence.length() < oligo.length()){      //let's just assume that the primers are the same length
1148                                 success = pdiffs + 10;                                  //if the sequence is shorter than the barcode then bail out
1149                                 break;  
1150                         }
1151                         
1152                         if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
1153                                 group = it->second;
1154                                 seq.setUnaligned(rawSequence.substr(oligo.length()));
1155                                 if(qual.getName() != ""){
1156                                         qual.trimQScores(oligo.length(), -1);
1157                                 }
1158                                 success = 0;
1159                                 break;
1160                         }
1161                 }
1162
1163                 //if you found the barcode or if you don't want to allow for diffs
1164 //              cout << success;
1165                 if ((pdiffs == 0) || (success == 0)) { return success;  }
1166                 
1167                 else { //try aligning and see if you can find it
1168 //                      cout << endl;
1169
1170                         int maxLength = 0;
1171
1172                         Alignment* alignment;
1173                         if (primers.size() > 0) {
1174                                 map<string,int>::iterator it=primers.begin();
1175
1176                                 for(it;it!=primers.end();it++){
1177                                         if(it->first.length() > maxLength){
1178                                                 maxLength = it->first.length();
1179                                         }
1180                                 }
1181                                 alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+pdiffs+1));  
1182
1183                         }else{ alignment = NULL; } 
1184                         
1185                         //can you find the barcode
1186                         int minDiff = 1e6;
1187                         int minCount = 1;
1188                         int minGroup = -1;
1189                         int minPos = 0;
1190                         
1191                         for(map<string,int>::iterator it=primers.begin();it!=primers.end();it++){
1192                                 string oligo = it->first;
1193 //                              int length = oligo.length();
1194                                 
1195                                 if(rawSequence.length() < maxLength){   
1196                                         success = pdiffs + 100;
1197                                         break;
1198                                 }
1199                                 
1200                                 //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
1201                                 alignment->align(oligo, rawSequence.substr(0,oligo.length()+pdiffs));
1202                                 oligo = alignment->getSeqAAln();
1203                                 string temp = alignment->getSeqBAln();
1204                 
1205                                 int alnLength = oligo.length();
1206                                 
1207                                 for(int i=oligo.length()-1;i>=0;i--){
1208                                         if(oligo[i] != '-'){    alnLength = i+1;        break;  }
1209                                 }
1210                                 oligo = oligo.substr(0,alnLength);
1211                                 temp = temp.substr(0,alnLength);
1212                                 
1213                                 int newStart=0;
1214                                 int numDiff = countDiffs(oligo, temp);
1215                                 
1216 //                              cout << oligo << '\t' << temp << '\t' << numDiff << endl;                               
1217                                 
1218                                 if(numDiff < minDiff){
1219                                         minDiff = numDiff;
1220                                         minCount = 1;
1221                                         minGroup = it->second;
1222                                         minPos = 0;
1223                                         for(int i=0;i<alnLength;i++){
1224                                                 if(temp[i] != '-'){
1225                                                         minPos++;
1226                                                 }
1227                                         }
1228                                 }
1229                                 else if(numDiff == minDiff){
1230                                         minCount++;
1231                                 }
1232
1233                         }
1234
1235                         if(minDiff > pdiffs)    {       success = minDiff;              }       //no good matches
1236                         else if(minCount > 1)   {       success = pdiffs + 10;  }       //can't tell the difference between multiple primers
1237                         else{                                                                                                   //use the best match
1238                                 group = minGroup;
1239                                 seq.setUnaligned(rawSequence.substr(minPos));
1240                                 if(qual.getName() != ""){
1241                                         qual.trimQScores(minPos, -1);
1242                                 }
1243                                 success = minDiff;
1244                         }
1245                         
1246                         if (alignment != NULL) {  delete alignment;  }
1247                         
1248                 }
1249                 
1250                 return success;
1251
1252         }
1253         catch(exception& e) {
1254                 m->errorOut(e, "TrimSeqsCommand", "stripForward");
1255                 exit(1);
1256         }
1257 }
1258
1259 //***************************************************************************************************************
1260
1261 bool TrimSeqsCommand::stripReverse(Sequence& seq, QualityScores& qual){
1262         try {
1263                 string rawSequence = seq.getUnaligned();
1264                 bool success = 0;       //guilty until proven innocent
1265                 
1266                 for(int i=0;i<numRPrimers;i++){
1267                         string oligo = revPrimer[i];
1268                         
1269                         if(rawSequence.length() < oligo.length()){
1270                                 success = 0;
1271                                 break;
1272                         }
1273                         
1274                         if(compareDNASeq(oligo, rawSequence.substr(rawSequence.length()-oligo.length(),oligo.length()))){
1275                                 seq.setUnaligned(rawSequence.substr(0,rawSequence.length()-oligo.length()));
1276                                 if(qual.getName() != ""){
1277                                         qual.trimQScores(-1, rawSequence.length()-oligo.length());
1278                                 }
1279                                 success = 1;
1280                                 break;
1281                         }
1282                 }       
1283                 return success;
1284                 
1285         }
1286         catch(exception& e) {
1287                 m->errorOut(e, "TrimSeqsCommand", "stripReverse");
1288                 exit(1);
1289         }
1290 }
1291
1292 //***************************************************************************************************************
1293
1294 bool TrimSeqsCommand::keepFirstTrim(Sequence& sequence, QualityScores& qscores){
1295         try {
1296                 bool success = 1;
1297                 if(qscores.getName() != ""){
1298                         qscores.trimQScores(-1, keepFirst);
1299                 }
1300                 sequence.trim(keepFirst);
1301                 return success;
1302         }
1303         catch(exception& e) {
1304                 m->errorOut(e, "keepFirstTrim", "countDiffs");
1305                 exit(1);
1306         }
1307         
1308 }       
1309
1310 //***************************************************************************************************************
1311
1312 bool TrimSeqsCommand::removeLastTrim(Sequence& sequence, QualityScores& qscores){
1313         try {
1314                 bool success = 0;
1315                 
1316                 int length = sequence.getNumBases() - removeLast;
1317                 
1318                 if(length > 0){
1319                         if(qscores.getName() != ""){
1320                                 qscores.trimQScores(-1, length);
1321                         }
1322                         sequence.trim(length);
1323                         success = 1;
1324                 }
1325                 else{
1326                         success = 0;
1327                 }
1328
1329                 return success;
1330         }
1331         catch(exception& e) {
1332                 m->errorOut(e, "removeLastTrim", "countDiffs");
1333                 exit(1);
1334         }
1335         
1336 }       
1337
1338 //***************************************************************************************************************
1339
1340 bool TrimSeqsCommand::cullLength(Sequence& seq){
1341         try {
1342         
1343                 int length = seq.getNumBases();
1344                 bool success = 0;       //guilty until proven innocent
1345                 
1346                 if(length >= minLength && maxLength == 0)                       {       success = 1;    }
1347                 else if(length >= minLength && length <= maxLength)     {       success = 1;    }
1348                 else                                                                                            {       success = 0;    }
1349                 
1350                 return success;
1351         
1352         }
1353         catch(exception& e) {
1354                 m->errorOut(e, "TrimSeqsCommand", "cullLength");
1355                 exit(1);
1356         }
1357         
1358 }
1359
1360 //***************************************************************************************************************
1361
1362 bool TrimSeqsCommand::cullHomoP(Sequence& seq){
1363         try {
1364                 int longHomoP = seq.getLongHomoPolymer();
1365                 bool success = 0;       //guilty until proven innocent
1366                 
1367                 if(longHomoP <= maxHomoP){      success = 1;    }
1368                 else                                    {       success = 0;    }
1369                 
1370                 return success;
1371         }
1372         catch(exception& e) {
1373                 m->errorOut(e, "TrimSeqsCommand", "cullHomoP");
1374                 exit(1);
1375         }
1376         
1377 }
1378
1379 //***************************************************************************************************************
1380
1381 bool TrimSeqsCommand::cullAmbigs(Sequence& seq){
1382         try {
1383                 int numNs = seq.getAmbigBases();
1384                 bool success = 0;       //guilty until proven innocent
1385                 
1386                 if(numNs <= maxAmbig)   {       success = 1;    }
1387                 else                                    {       success = 0;    }
1388                 
1389                 return success;
1390         }
1391         catch(exception& e) {
1392                 m->errorOut(e, "TrimSeqsCommand", "cullAmbigs");
1393                 exit(1);
1394         }
1395         
1396 }
1397
1398 //***************************************************************************************************************
1399
1400 bool TrimSeqsCommand::compareDNASeq(string oligo, string seq){
1401         try {
1402                 bool success = 1;
1403                 int length = oligo.length();
1404                 
1405                 for(int i=0;i<length;i++){
1406                         
1407                         if(oligo[i] != seq[i]){
1408                                 if(oligo[i] == 'A' || oligo[i] == 'T' || oligo[i] == 'G' || oligo[i] == 'C')    {       success = 0;    }
1409                                 else if((oligo[i] == 'N' || oligo[i] == 'I') && (seq[i] == 'N'))                                {       success = 0;    }
1410                                 else if(oligo[i] == 'R' && (seq[i] != 'A' && seq[i] != 'G'))                                    {       success = 0;    }
1411                                 else if(oligo[i] == 'Y' && (seq[i] != 'C' && seq[i] != 'T'))                                    {       success = 0;    }
1412                                 else if(oligo[i] == 'M' && (seq[i] != 'C' && seq[i] != 'A'))                                    {       success = 0;    }
1413                                 else if(oligo[i] == 'K' && (seq[i] != 'T' && seq[i] != 'G'))                                    {       success = 0;    }
1414                                 else if(oligo[i] == 'W' && (seq[i] != 'T' && seq[i] != 'A'))                                    {       success = 0;    }
1415                                 else if(oligo[i] == 'S' && (seq[i] != 'C' && seq[i] != 'G'))                                    {       success = 0;    }
1416                                 else if(oligo[i] == 'B' && (seq[i] != 'C' && seq[i] != 'T' && seq[i] != 'G'))   {       success = 0;    }
1417                                 else if(oligo[i] == 'D' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'G'))   {       success = 0;    }
1418                                 else if(oligo[i] == 'H' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'C'))   {       success = 0;    }
1419                                 else if(oligo[i] == 'V' && (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G'))   {       success = 0;    }                       
1420                                 
1421                                 if(success == 0)        {       break;   }
1422                         }
1423                         else{
1424                                 success = 1;
1425                         }
1426                 }
1427                 
1428                 return success;
1429         }
1430         catch(exception& e) {
1431                 m->errorOut(e, "TrimSeqsCommand", "compareDNASeq");
1432                 exit(1);
1433         }
1434
1435 }
1436 //***************************************************************************************************************
1437
1438 int TrimSeqsCommand::countDiffs(string oligo, string seq){
1439         try {
1440
1441                 int length = oligo.length();
1442                 int countDiffs = 0;
1443                 
1444                 for(int i=0;i<length;i++){
1445                                                                 
1446                         if(oligo[i] != seq[i]){
1447                                 if(oligo[i] == 'A' || oligo[i] == 'T' || oligo[i] == 'G' || oligo[i] == 'C' || oligo[i] == '-' || oligo[i] == '.')      {       countDiffs++;   }
1448                                 else if((oligo[i] == 'N' || oligo[i] == 'I') && (seq[i] == 'N'))                                {       countDiffs++;   }
1449                                 else if(oligo[i] == 'R' && (seq[i] != 'A' && seq[i] != 'G'))                                    {       countDiffs++;   }
1450                                 else if(oligo[i] == 'Y' && (seq[i] != 'C' && seq[i] != 'T'))                                    {       countDiffs++;   }
1451                                 else if(oligo[i] == 'M' && (seq[i] != 'C' && seq[i] != 'A'))                                    {       countDiffs++;   }
1452                                 else if(oligo[i] == 'K' && (seq[i] != 'T' && seq[i] != 'G'))                                    {       countDiffs++;   }
1453                                 else if(oligo[i] == 'W' && (seq[i] != 'T' && seq[i] != 'A'))                                    {       countDiffs++;   }
1454                                 else if(oligo[i] == 'S' && (seq[i] != 'C' && seq[i] != 'G'))                                    {       countDiffs++;   }
1455                                 else if(oligo[i] == 'B' && (seq[i] != 'C' && seq[i] != 'T' && seq[i] != 'G'))   {       countDiffs++;   }
1456                                 else if(oligo[i] == 'D' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'G'))   {       countDiffs++;   }
1457                                 else if(oligo[i] == 'H' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'C'))   {       countDiffs++;   }
1458                                 else if(oligo[i] == 'V' && (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G'))   {       countDiffs++;   }       
1459                         }
1460                         
1461                 }
1462                 
1463                 return countDiffs;
1464         }
1465         catch(exception& e) {
1466                 m->errorOut(e, "TrimSeqsCommand", "countDiffs");
1467                 exit(1);
1468         }
1469
1470 }
1471
1472 //***************************************************************************************************************