]> git.donarmstrong.com Git - mothur.git/blob - trimseqscommand.cpp
added getCommandInfoCommand for gui
[mothur.git] / trimseqscommand.cpp
1 /*
2  *  trimseqscommand.cpp
3  *  Mothur
4  *
5  *  Created by Pat Schloss on 6/6/09.
6  *  Copyright 2009 Patrick D. Schloss. All rights reserved.
7  *
8  */
9
10 #include "trimseqscommand.h"
11 #include "needlemanoverlap.hpp"
12
13 //**********************************************************************************************************************
14 vector<string> TrimSeqsCommand::setParameters(){        
15         try {
16                 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta);
17                 CommandParameter poligos("oligos", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(poligos);
18                 CommandParameter pqfile("qfile", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pqfile);
19                 CommandParameter pflip("flip", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pflip);
20                 CommandParameter pmaxambig("maxambig", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pmaxambig);
21                 CommandParameter pmaxhomop("maxhomop", "Number", "", "0", "", "", "",false,false); parameters.push_back(pmaxhomop);
22                 CommandParameter pminlength("minlength", "Number", "", "0", "", "", "",false,false); parameters.push_back(pminlength);
23                 CommandParameter pmaxlength("maxlength", "Number", "", "0", "", "", "",false,false); parameters.push_back(pmaxlength);
24                 CommandParameter ppdiffs("pdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(ppdiffs);
25                 CommandParameter pbdiffs("bdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(pbdiffs);
26                 CommandParameter ptdiffs("tdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(ptdiffs);
27                 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
28                 CommandParameter pallfiles("allfiles", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pallfiles);
29                 CommandParameter pqtrim("qtrim", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pqtrim);
30                 CommandParameter pqthreshold("qthreshold", "Number", "", "0", "", "", "",false,false); parameters.push_back(pqthreshold);
31                 CommandParameter pqaverage("qaverage", "Number", "", "0", "", "", "",false,false); parameters.push_back(pqaverage);
32                 CommandParameter prollaverage("rollaverage", "Number", "", "0", "", "", "",false,false); parameters.push_back(prollaverage);
33                 CommandParameter pqwindowaverage("qwindowaverage", "Number", "", "0", "", "", "",false,false); parameters.push_back(pqwindowaverage);
34                 CommandParameter pqstepsize("qstepsize", "Number", "", "1", "", "", "",false,false); parameters.push_back(pqstepsize);
35                 CommandParameter pqwindowsize("qwindowsize", "Number", "", "50", "", "", "",false,false); parameters.push_back(pqwindowsize);
36                 CommandParameter pkeepfirst("keepfirst", "Number", "", "0", "", "", "",false,false); parameters.push_back(pkeepfirst);
37                 CommandParameter premovelast("removelast", "Number", "", "0", "", "", "",false,false); parameters.push_back(premovelast);
38                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
39                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
40                         
41                 vector<string> myArray;
42                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
43                 return myArray;
44         }
45         catch(exception& e) {
46                 m->errorOut(e, "TrimSeqsCommand", "setParameters");
47                 exit(1);
48         }
49 }
50 //**********************************************************************************************************************
51 string TrimSeqsCommand::getHelpString(){        
52         try {
53                 string helpString = "";
54                 helpString += "The trim.seqs command reads a fastaFile and creates 2 new fasta files, .trim.fasta and scrap.fasta, as well as group files if you provide and oligos file.\n";
55                 helpString += "The .trim.fasta contains sequences that meet your requirements, and the .scrap.fasta contains those which don't.\n";
56                 helpString += "The trim.seqs command parameters are fasta, flip, oligos, maxambig, maxhomop, minlength, maxlength, qfile, qthreshold, qaverage, diffs, qtrim, keepfirst, removelast and allfiles.\n";
57                 helpString += "The fasta parameter is required.\n";
58                 helpString += "The flip parameter will output the reverse compliment of your trimmed sequence. The default is false.\n";
59                 helpString += "The oligos parameter allows you to provide an oligos file.\n";
60                 helpString += "The maxambig parameter allows you to set the maximum number of ambigious bases allowed. The default is -1.\n";
61                 helpString += "The maxhomop parameter allows you to set a maximum homopolymer length. \n";
62                 helpString += "The minlength parameter allows you to set and minimum sequence length. \n";
63                 helpString += "The maxlength parameter allows you to set and maximum sequence length. \n";
64                 helpString += "The tdiffs parameter is used to specify the total number of differences allowed in the sequence. The default is pdiffs + bdiffs.\n";
65                 helpString += "The bdiffs parameter is used to specify the number of differences allowed in the barcode. The default is 0.\n";
66                 helpString += "The pdiffs parameter is used to specify the number of differences allowed in the primer. The default is 0.\n";
67                 helpString += "The qfile parameter allows you to provide a quality file.\n";
68                 helpString += "The qthreshold parameter allows you to set a minimum quality score allowed. \n";
69                 helpString += "The qaverage parameter allows you to set a minimum average quality score allowed. \n";
70                 helpString += "The qwindowsize parameter allows you to set a number of bases in a window. Default=50.\n";
71                 helpString += "The qwindowaverage parameter allows you to set a minimum average quality score allowed over a window. \n";
72                 helpString += "The rollaverage parameter allows you to set a minimum rolling average quality score allowed over a window. \n";
73                 helpString += "The qstepsize parameter allows you to set a number of bases to move the window over. Default=1.\n";
74                 helpString += "The allfiles parameter will create separate group and fasta file for each grouping. The default is F.\n";
75                 helpString += "The qtrim parameter will trim sequence from the point that they fall below the qthreshold and put it in the .trim file if set to true. The default is T.\n";
76                 helpString += "The keepfirst parameter trims the sequence to the first keepfirst number of bases after the barcode or primers are removed, before the sequence is checked to see if it meets the other requirements. \n";
77                 helpString += "The removelast removes the last removelast number of bases after the barcode or primers are removed, before the sequence is checked to see if it meets the other requirements.\n";
78                 helpString += "The trim.seqs command should be in the following format: \n";
79                 helpString += "trim.seqs(fasta=yourFastaFile, flip=yourFlip, oligos=yourOligos, maxambig=yourMaxambig,  \n";
80                 helpString += "maxhomop=yourMaxhomop, minlength=youMinlength, maxlength=yourMaxlength)  \n";    
81                 helpString += "Example trim.seqs(fasta=abrecovery.fasta, flip=..., oligos=..., maxambig=..., maxhomop=..., minlength=..., maxlength=...).\n";
82                 helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n";
83                 helpString += "For more details please check out the wiki http://www.mothur.org/wiki/Trim.seqs .\n";
84                 return helpString;
85         }
86         catch(exception& e) {
87                 m->errorOut(e, "TrimSeqsCommand", "getHelpString");
88                 exit(1);
89         }
90 }
91
92
93 //**********************************************************************************************************************
94
95 TrimSeqsCommand::TrimSeqsCommand(){     
96         try {
97                 abort = true; calledHelp = true; 
98                 setParameters();
99                 vector<string> tempOutNames;
100                 outputTypes["fasta"] = tempOutNames;
101                 outputTypes["qfile"] = tempOutNames;
102                 outputTypes["group"] = tempOutNames;
103         }
104         catch(exception& e) {
105                 m->errorOut(e, "TrimSeqsCommand", "TrimSeqsCommand");
106                 exit(1);
107         }
108 }
109 //***************************************************************************************************************
110
111 TrimSeqsCommand::TrimSeqsCommand(string option)  {
112         try {
113                 
114                 abort = false; calledHelp = false;   
115                 comboStarts = 0;
116                 
117                 //allow user to run help
118                 if(option == "help") { help(); abort = true; calledHelp = true; }
119                 
120                 else {
121                         vector<string> myArray = setParameters();
122                         
123                         OptionParser parser(option);
124                         map<string,string> parameters = parser.getParameters();
125                         
126                         ValidParameters validParameter;
127                         map<string,string>::iterator it;
128                         
129                         //check to make sure all parameters are valid for command
130                         for (it = parameters.begin(); it != parameters.end(); it++) { 
131                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
132                         }
133                         
134                         //initialize outputTypes
135                         vector<string> tempOutNames;
136                         outputTypes["fasta"] = tempOutNames;
137                         outputTypes["qfile"] = tempOutNames;
138                         outputTypes["group"] = tempOutNames;
139                         
140                         //if the user changes the input directory command factory will send this info to us in the output parameter 
141                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
142                         if (inputDir == "not found"){   inputDir = "";          }
143                         else {
144                                 string path;
145                                 it = parameters.find("fasta");
146                                 //user has given a template file
147                                 if(it != parameters.end()){ 
148                                         path = m->hasPath(it->second);
149                                         //if the user has not given a path then, add inputdir. else leave path alone.
150                                         if (path == "") {       parameters["fasta"] = inputDir + it->second;            }
151                                 }
152                                 
153                                 it = parameters.find("oligos");
154                                 //user has given a template file
155                                 if(it != parameters.end()){ 
156                                         path = m->hasPath(it->second);
157                                         //if the user has not given a path then, add inputdir. else leave path alone.
158                                         if (path == "") {       parameters["oligos"] = inputDir + it->second;           }
159                                 }
160                                 
161                                 it = parameters.find("qfile");
162                                 //user has given a template file
163                                 if(it != parameters.end()){ 
164                                         path = m->hasPath(it->second);
165                                         //if the user has not given a path then, add inputdir. else leave path alone.
166                                         if (path == "") {       parameters["qfile"] = inputDir + it->second;            }
167                                 }
168                                 
169                         }
170
171                         
172                         //check for required parameters
173                         fastaFile = validParameter.validFile(parameters, "fasta", true);
174                         if (fastaFile == "not found") {                                 
175                                 fastaFile = m->getFastaFile(); 
176                                 if (fastaFile != "") { m->mothurOut("Using " + fastaFile + " as input file for the fasta parameter."); m->mothurOutEndLine(); }
177                                 else {  m->mothurOut("You have no current fastafile and the fasta parameter is required."); m->mothurOutEndLine(); abort = true; }
178                         }else if (fastaFile == "not open") { abort = true; }    
179                         
180                         //if the user changes the output directory command factory will send this info to us in the output parameter 
181                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  
182                                 outputDir = ""; 
183                                 outputDir += m->hasPath(fastaFile); //if user entered a file with a path then preserve it       
184                         }
185                 
186                         
187                         //check for optional parameter and set defaults
188                         // ...at some point should added some additional type checking...
189                         string temp;
190                         temp = validParameter.validFile(parameters, "flip", false);
191                         if (temp == "not found"){       flip = 0;       }
192                         else if(m->isTrue(temp))        {       flip = 1;       }
193                 
194                         temp = validParameter.validFile(parameters, "oligos", true);
195                         if (temp == "not found"){       oligoFile = "";         }
196                         else if(temp == "not open"){    abort = true;   } 
197                         else                                    {       oligoFile = temp;               }
198                         
199                         
200                         temp = validParameter.validFile(parameters, "maxambig", false);         if (temp == "not found") { temp = "-1"; }
201                         convert(temp, maxAmbig);  
202
203                         temp = validParameter.validFile(parameters, "maxhomop", false);         if (temp == "not found") { temp = "0"; }
204                         convert(temp, maxHomoP);  
205
206                         temp = validParameter.validFile(parameters, "minlength", false);        if (temp == "not found") { temp = "0"; }
207                         convert(temp, minLength); 
208                         
209                         temp = validParameter.validFile(parameters, "maxlength", false);        if (temp == "not found") { temp = "0"; }
210                         convert(temp, maxLength);
211                         
212                         temp = validParameter.validFile(parameters, "bdiffs", false);           if (temp == "not found") { temp = "0"; }
213                         convert(temp, bdiffs);
214                         
215                         temp = validParameter.validFile(parameters, "pdiffs", false);           if (temp == "not found") { temp = "0"; }
216                         convert(temp, pdiffs);
217                         
218                         temp = validParameter.validFile(parameters, "tdiffs", false);           if (temp == "not found") { int tempTotal = pdiffs + bdiffs;  temp = toString(tempTotal); }
219                         convert(temp, tdiffs);
220                         
221                         if(tdiffs == 0){        tdiffs = bdiffs + pdiffs;       }
222                         
223                         temp = validParameter.validFile(parameters, "qfile", true);     
224                         if (temp == "not found")        {       qFileName = "";         }
225                         else if(temp == "not open")     {       abort = true;           }
226                         else                                            {       qFileName = temp;       }
227                         
228                         temp = validParameter.validFile(parameters, "qthreshold", false);       if (temp == "not found") { temp = "0"; }
229                         convert(temp, qThreshold);
230                         
231                         temp = validParameter.validFile(parameters, "qtrim", false);            if (temp == "not found") { temp = "t"; }
232                         qtrim = m->isTrue(temp);
233
234                         temp = validParameter.validFile(parameters, "rollaverage", false);      if (temp == "not found") { temp = "0"; }
235                         convert(temp, qRollAverage);
236
237                         temp = validParameter.validFile(parameters, "qwindowaverage", false);if (temp == "not found") { temp = "0"; }
238                         convert(temp, qWindowAverage);
239
240                         temp = validParameter.validFile(parameters, "qwindowsize", false);      if (temp == "not found") { temp = "50"; }
241                         convert(temp, qWindowSize);
242
243                         temp = validParameter.validFile(parameters, "qstepsize", false);        if (temp == "not found") { temp = "1"; }
244                         convert(temp, qWindowStep);
245
246                         temp = validParameter.validFile(parameters, "qaverage", false);         if (temp == "not found") { temp = "0"; }
247                         convert(temp, qAverage);
248
249                         temp = validParameter.validFile(parameters, "keepfirst", false);        if (temp == "not found") { temp = "0"; }
250                         convert(temp, keepFirst);
251
252                         temp = validParameter.validFile(parameters, "removelast", false);       if (temp == "not found") { temp = "0"; }
253                         convert(temp, removeLast);
254                         
255                         temp = validParameter.validFile(parameters, "allfiles", false);         if (temp == "not found") { temp = "F"; }
256                         allFiles = m->isTrue(temp);
257                         
258                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found"){       temp = m->getProcessors();      }
259                         m->setProcessors(temp);
260                         convert(temp, processors); 
261                         
262                         
263                         if(allFiles && (oligoFile == "")){
264                                 m->mothurOut("You selected allfiles, but didn't enter an oligos.  Ignoring the allfiles request."); m->mothurOutEndLine();
265                         }
266                         if((qAverage != 0 && qThreshold != 0) && qFileName == ""){
267                                 m->mothurOut("You didn't provide a quality file name, quality criteria will be ignored."); m->mothurOutEndLine();
268                                 qAverage=0;
269                                 qThreshold=0;
270                         }
271                         if(!flip && oligoFile=="" && !maxLength && !minLength && (maxAmbig==-1) && !maxHomoP && qFileName == ""){               
272                                 m->mothurOut("You didn't set any options... quiting command."); m->mothurOutEndLine();
273                                 abort = true;
274                         }
275                 }
276
277         }
278         catch(exception& e) {
279                 m->errorOut(e, "TrimSeqsCommand", "TrimSeqsCommand");
280                 exit(1);
281         }
282 }
283 //***************************************************************************************************************
284
285 int TrimSeqsCommand::execute(){
286         try{
287         
288                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
289                 
290                 numFPrimers = 0;  //this needs to be initialized
291                 numRPrimers = 0;
292                 vector<vector<string> > fastaFileNames;
293                 vector<vector<string> > qualFileNames;
294                 
295                 string trimSeqFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "trim.fasta";
296                 outputNames.push_back(trimSeqFile); outputTypes["fasta"].push_back(trimSeqFile);
297                 
298                 string scrapSeqFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "scrap.fasta";
299                 outputNames.push_back(scrapSeqFile); outputTypes["fasta"].push_back(scrapSeqFile);
300                 
301                 string trimQualFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "trim.qual";
302                 string scrapQualFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "scrap.qual";
303                 if (qFileName != "") {
304                         outputNames.push_back(trimQualFile);
305                         outputNames.push_back(scrapQualFile);
306                         outputTypes["qfile"].push_back(trimQualFile);
307                         outputTypes["qfile"].push_back(scrapQualFile); 
308                 }
309                 
310                 string outputGroupFileName;
311                 if(oligoFile != ""){
312                         outputGroupFileName = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "groups";
313                         outputNames.push_back(outputGroupFileName); outputTypes["group"].push_back(outputGroupFileName);
314                         getOligos(fastaFileNames, qualFileNames);
315                 }
316
317                 vector<unsigned long int> fastaFilePos;
318                 vector<unsigned long int> qFilePos;
319                 
320                 setLines(fastaFile, qFileName, fastaFilePos, qFilePos);
321                 
322                 for (int i = 0; i < (fastaFilePos.size()-1); i++) {
323                         lines.push_back(new linePair(fastaFilePos[i], fastaFilePos[(i+1)]));
324                         if (qFileName != "") {  qLines.push_back(new linePair(qFilePos[i], qFilePos[(i+1)]));  }
325                 }       
326                 if(qFileName == "")     {       qLines = lines; } //files with duds
327                 
328                 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
329                                 if(processors == 1){
330                                         driverCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, outputGroupFileName, fastaFileNames, qualFileNames, lines[0], qLines[0]);
331                                 }else{
332                                         createProcessesCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, outputGroupFileName, fastaFileNames, qualFileNames); 
333                                 }       
334                 #else
335                                 driverCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, outputGroupFileName, fastaFileNames, qualFileNames, lines[0], qLines[0]);
336                 #endif
337                 
338                 if (m->control_pressed) {  return 0; }                  
339                         
340                 if(allFiles){
341                         map<string, string> uniqueFastaNames;// so we don't add the same groupfile multiple times
342                         map<string, string>::iterator it;
343                         set<string> namesToRemove;
344                         for(int i=0;i<fastaFileNames.size();i++){
345                                 for(int j=0;j<fastaFileNames[0].size();j++){
346                                         if (fastaFileNames[i][j] != "") {
347                                                 if(m->isBlank(fastaFileNames[i][j])){
348                                                         remove(fastaFileNames[i][j].c_str());
349                                                         namesToRemove.insert(fastaFileNames[i][j]);
350                                                         
351                                                         if(qFileName != ""){
352                                                                 remove(qualFileNames[i][j].c_str());
353                                                                 namesToRemove.insert(qualFileNames[i][j]);
354                                                         }
355                                                 }else{  
356                                                         it = uniqueFastaNames.find(fastaFileNames[i][j]);
357                                                         if (it == uniqueFastaNames.end()) {     
358                                                                 uniqueFastaNames[fastaFileNames[i][j]] = barcodeNameVector[i];  
359                                                         }       
360                                                 }
361                                         }
362                                 }
363                         }
364                         
365                         //remove names for outputFileNames, just cleans up the output
366                         vector<string> outputNames2;
367                         for(int i = 0; i < outputNames.size(); i++) { if (namesToRemove.count(outputNames[i]) == 0) { outputNames2.push_back(outputNames[i]); } }
368                         outputNames = outputNames2;
369                         
370                         for (it = uniqueFastaNames.begin(); it != uniqueFastaNames.end(); it++) {
371                                 ifstream in;
372                                 m->openInputFile(it->first, in);
373                                 
374                                 ofstream out;
375                                 string thisGroupName = outputDir + m->getRootName(m->getSimpleName(it->first)) + "groups";
376                                 outputNames.push_back(thisGroupName); outputTypes["group"].push_back(thisGroupName);
377                                 m->openOutputFile(thisGroupName, out);
378                                 
379                                 while (!in.eof()){
380                                         if (m->control_pressed) { break; }
381                                         
382                                         Sequence currSeq(in); m->gobble(in);
383                                         out << currSeq.getName() << '\t' << it->second << endl;
384                                 }
385                                 in.close();
386                                 out.close();
387                         }
388                 }
389                 
390                 if (m->control_pressed) {       for (int i = 0; i < outputNames.size(); i++) {  remove(outputNames[i].c_str()); } return 0;     }
391
392                 //output group counts
393                 m->mothurOutEndLine();
394                 int total = 0;
395                 for (map<string, int>::iterator it = groupCounts.begin(); it != groupCounts.end(); it++) {
396                          total += it->second; m->mothurOut("Group " + it->first + " contains " + toString(it->second) + " sequences."); m->mothurOutEndLine(); 
397                 }
398                 if (total != 0) { m->mothurOut("Total of all groups is " + toString(total)); m->mothurOutEndLine(); }
399                 
400                 if (m->control_pressed) {       for (int i = 0; i < outputNames.size(); i++) {  remove(outputNames[i].c_str()); } return 0;     }
401
402                 //set fasta file as new current fastafile
403                 string current = "";
404                 itTypes = outputTypes.find("fasta");
405                 if (itTypes != outputTypes.end()) {
406                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFastaFile(current); }
407                 }
408                 
409                 itTypes = outputTypes.find("qfile");
410                 if (itTypes != outputTypes.end()) {
411                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setQualFile(current); }
412                 }
413                 
414                 itTypes = outputTypes.find("group");
415                 if (itTypes != outputTypes.end()) {
416                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setGroupFile(current); }
417                 }
418
419                 m->mothurOutEndLine();
420                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
421                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
422                 m->mothurOutEndLine();
423                 
424                 return 0;       
425                         
426         }
427         catch(exception& e) {
428                 m->errorOut(e, "TrimSeqsCommand", "execute");
429                 exit(1);
430         }
431 }
432                 
433 /**************************************************************************************/
434
435 int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string trimFileName, string scrapFileName, string trimQFileName, string scrapQFileName, string groupFileName, vector<vector<string> > fastaFileNames, vector<vector<string> > qualFileNames, linePair* line, linePair* qline) {        
436                 
437         try {
438                 
439                 ofstream trimFASTAFile;
440                 m->openOutputFile(trimFileName, trimFASTAFile);
441                 
442                 ofstream scrapFASTAFile;
443                 m->openOutputFile(scrapFileName, scrapFASTAFile);
444                 
445                 ofstream trimQualFile;
446                 ofstream scrapQualFile;
447                 if(qFileName != ""){
448                         m->openOutputFile(trimQFileName, trimQualFile);
449                         m->openOutputFile(scrapQFileName, scrapQualFile);
450                 }
451                 
452                 ofstream outGroupsFile;
453                 if (oligoFile != ""){   m->openOutputFile(groupFileName, outGroupsFile);   }
454                 if(allFiles){
455                         for (int i = 0; i < fastaFileNames.size(); i++) { //clears old file
456                                 for (int j = 0; j < fastaFileNames[i].size(); j++) { //clears old file
457                                         if (fastaFileNames[i][j] != "") {
458                                                 ofstream temp;
459                                                 m->openOutputFile(fastaFileNames[i][j], temp);                  temp.close();
460                                                 if(qFileName != ""){
461                                                         m->openOutputFile(qualFileNames[i][j], temp);                   temp.close();
462                                                 }
463                                         }
464                                 }
465                         }
466                 }
467                 
468                 ifstream inFASTA;
469                 m->openInputFile(filename, inFASTA);
470                 inFASTA.seekg(line->start);
471                 
472                 ifstream qFile;
473                 if(qFileName != "")     {
474                         m->openInputFile(qFileName, qFile);
475                         qFile.seekg(qline->start);  
476                 }
477                 
478                 int count = 0;
479                 bool moreSeqs = 1;
480         
481                 while (moreSeqs) {
482                                 
483                         if (m->control_pressed) { 
484                                 inFASTA.close(); trimFASTAFile.close(); scrapFASTAFile.close();
485                                 if (oligoFile != "") {   outGroupsFile.close();   }
486
487                                 if(qFileName != ""){
488                                         qFile.close();
489                                 }
490                                 for (int i = 0; i < outputNames.size(); i++) {  remove(outputNames[i].c_str()); }
491
492                                 return 0;
493                         }
494                         
495                         int success = 1;
496                         string trashCode = "";
497                         int currentSeqsDiffs = 0;
498
499                         Sequence currSeq(inFASTA); m->gobble(inFASTA);
500
501                         QualityScores currQual;
502                         if(qFileName != ""){
503                                 currQual = QualityScores(qFile);  m->gobble(qFile);
504                         }
505
506                         string origSeq = currSeq.getUnaligned();
507                         if (origSeq != "") {
508                                 
509                                 int barcodeIndex = 0;
510                                 int primerIndex = 0;
511                                 
512                                 if(barcodes.size() != 0){
513                                         success = stripBarcode(currSeq, currQual, barcodeIndex);
514                                         if(success > bdiffs)            {       trashCode += 'b';       }
515                                         else{ currentSeqsDiffs += success;  }
516                                 }
517                                 
518                                 if(numFPrimers != 0){
519                                         success = stripForward(currSeq, currQual, primerIndex);
520                                         if(success > pdiffs)            {       trashCode += 'f';       }
521                                         else{ currentSeqsDiffs += success;  }
522                                 }
523                                 
524                                 if (currentSeqsDiffs > tdiffs)  {       trashCode += 't';   }
525                                 
526                                 if(numRPrimers != 0){
527                                         success = stripReverse(currSeq, currQual);
528                                         if(!success)                            {       trashCode += 'r';       }
529                                 }
530
531                                 if(keepFirst != 0){
532                                         success = keepFirstTrim(currSeq, currQual);
533                                 }
534                                 
535                                 if(removeLast != 0){
536                                         success = removeLastTrim(currSeq, currQual);
537                                         if(!success)                            {       trashCode += 'l';       }
538                                 }
539
540                                 
541                                 if(qFileName != ""){
542                                         int origLength = currSeq.getNumBases();
543                                         
544                                         if(qThreshold != 0)                     {       success = currQual.stripQualThreshold(currSeq, qThreshold);                     }
545                                         else if(qAverage != 0)          {       success = currQual.cullQualAverage(currSeq, qAverage);                          }
546                                         else if(qRollAverage != 0)      {       success = currQual.stripQualRollingAverage(currSeq, qRollAverage);      }
547                                         else if(qWindowAverage != 0){   success = currQual.stripQualWindowAverage(currSeq, qWindowStep, qWindowSize, qWindowAverage);   }
548                                         else                                            {       success = 1;                            }
549                                         
550                                         //you don't want to trim, if it fails above then scrap it
551                                         if ((!qtrim) && (origLength != currSeq.getNumBases())) {  success = 0; }
552                                         
553                                         if(!success)                            {       trashCode += 'q';       }
554                                 }                               
555                 
556                                 if(minLength > 0 || maxLength > 0){
557                                         success = cullLength(currSeq);
558                                         if(!success)                            {       trashCode += 'l';       }
559                                 }
560                                 if(maxHomoP > 0){
561                                         success = cullHomoP(currSeq);
562                                         if(!success)                            {       trashCode += 'h';       }
563                                 }
564                                 if(maxAmbig != -1){
565                                         success = cullAmbigs(currSeq);
566                                         if(!success)                            {       trashCode += 'n';       }
567                                 }
568                                 
569                                 if(flip){               // should go last                       
570                                         currSeq.reverseComplement();
571                                         if(qFileName != ""){
572                                                 currQual.flipQScores(); 
573                                         }
574                                 }
575                                 
576                                 if(trashCode.length() == 0){
577                                         currSeq.setAligned(currSeq.getUnaligned());
578                                         currSeq.printSequence(trimFASTAFile);
579                                         
580                                         if(qFileName != ""){
581                                                 currQual.printQScores(trimQualFile);
582                                         }
583                                         
584                                         if(barcodes.size() != 0){
585                                                 string thisGroup = barcodeNameVector[barcodeIndex];
586                                                 if (primers.size() != 0) { thisGroup += "." + primerNameVector[primerIndex]; }
587                                                 
588                                                 outGroupsFile << currSeq.getName() << '\t' << thisGroup << endl;
589                                                 
590                                                 map<string, int>::iterator it = groupCounts.find(thisGroup);
591                                                 if (it == groupCounts.end()) {  groupCounts[thisGroup] = 1; }
592                                                 else { groupCounts[it->first]++; }
593                                                         
594                                         }
595                                         
596                                         
597                                         if(allFiles){
598                                                 ofstream output;
599                                                 m->openOutputFileAppend(fastaFileNames[barcodeIndex][primerIndex], output);
600                                                 currSeq.printSequence(output);
601                                                 output.close();
602                                                 
603                                                 if(qFileName != ""){
604                                                         m->openOutputFileAppend(qualFileNames[barcodeIndex][primerIndex], output);
605                                                         currQual.printQScores(output);
606                                                         output.close();                                                 
607                                                 }
608                                         }
609                                 }
610                                 else{
611                                         currSeq.setName(currSeq.getName() + '|' + trashCode);
612                                         currSeq.setUnaligned(origSeq);
613                                         currSeq.setAligned(origSeq);
614                                         currSeq.printSequence(scrapFASTAFile);
615                                         if(qFileName != ""){
616                                                 currQual.printQScores(scrapQualFile);
617                                         }
618                                 }
619                                 count++;
620                         }
621                         
622                         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
623                                 unsigned long int pos = inFASTA.tellg();
624                                 if ((pos == -1) || (pos >= line->end)) { break; }
625                         #else
626                                 if (inFASTA.eof()) { break; }
627                         #endif
628                                 
629                         //report progress
630                         if((count) % 1000 == 0){        m->mothurOut(toString(count)); m->mothurOutEndLine();           }
631                         
632                 }
633                 //report progress
634                 if((count) % 1000 != 0){        m->mothurOut(toString(count)); m->mothurOutEndLine();           }
635
636                 
637                 inFASTA.close();
638                 trimFASTAFile.close();
639                 scrapFASTAFile.close();
640                 if (oligoFile != "") {   outGroupsFile.close();   }
641                 if(qFileName != "")     {       qFile.close();  scrapQualFile.close(); trimQualFile.close();    }
642                 
643                 return count;
644         }
645         catch(exception& e) {
646                 m->errorOut(e, "TrimSeqsCommand", "driverCreateTrim");
647                 exit(1);
648         }
649 }
650
651 /**************************************************************************************************/
652
653 int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName, string trimFASTAFileName, string scrapFASTAFileName, string trimQualFileName, string scrapQualFileName, string groupFile, vector<vector<string> > fastaFileNames, vector<vector<string> > qualFileNames) {
654         try {
655 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
656                 int process = 1;
657                 int exitCommand = 1;
658                 processIDS.clear();
659                 
660                 //loop through and create all the processes you want
661                 while (process != processors) {
662                         int pid = fork();
663                         
664                         if (pid > 0) {
665                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
666                                 process++;
667                         }else if (pid == 0){
668                                 
669                                 vector<vector<string> > tempFASTAFileNames = fastaFileNames;
670                                 vector<vector<string> > tempPrimerQualFileNames = qualFileNames;
671
672                                 if(allFiles){
673                                         ofstream temp;
674
675                                         for(int i=0;i<tempFASTAFileNames.size();i++){
676                                                 for(int j=0;j<tempFASTAFileNames[i].size();j++){
677                                                         if (tempFASTAFileNames[i][j] != "") {
678                                                                 tempFASTAFileNames[i][j] += toString(getpid()) + ".temp";
679                                                                 m->openOutputFile(tempFASTAFileNames[i][j], temp);                      temp.close();
680
681                                                                 if(qFileName != ""){
682                                                                         tempPrimerQualFileNames[i][j] += toString(getpid()) + ".temp";
683                                                                         m->openOutputFile(tempPrimerQualFileNames[i][j], temp);         temp.close();
684                                                                 }
685                                                         }
686                                                 }
687                                         }
688                                 }
689                                                         
690                                 driverCreateTrim(filename,
691                                                                  qFileName,
692                                                                  (trimFASTAFileName + toString(getpid()) + ".temp"),
693                                                                  (scrapFASTAFileName + toString(getpid()) + ".temp"),
694                                                                  (trimQualFileName + toString(getpid()) + ".temp"),
695                                                                  (scrapQualFileName + toString(getpid()) + ".temp"),
696                                                                  (groupFile + toString(getpid()) + ".temp"),
697                                                                  tempFASTAFileNames,
698                                                                  tempPrimerQualFileNames,
699                                                                  lines[process],
700                                                                  qLines[process]);
701                                 
702                                 //pass groupCounts to parent
703                                 ofstream out;
704                                 string tempFile = filename + toString(getpid()) + ".num.temp";
705                                 m->openOutputFile(tempFile, out);
706                                 for (map<string, int>::iterator it = groupCounts.begin(); it != groupCounts.end(); it++) {
707                                         out << it->first << '\t' << it->second << endl;
708                                 }
709                                 out.close();
710                                 
711                                 exit(0);
712                         }else { 
713                                 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine(); 
714                                 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
715                                 exit(0);
716                         }
717                 }
718                 
719                 //parent do my part
720                 ofstream temp;
721                 m->openOutputFile(trimFASTAFileName, temp);             temp.close();
722                 m->openOutputFile(scrapFASTAFileName, temp);    temp.close();
723                 m->openOutputFile(trimQualFileName, temp);              temp.close();
724                 m->openOutputFile(scrapQualFileName, temp);             temp.close();
725
726                 driverCreateTrim(filename, qFileName, trimFASTAFileName, scrapFASTAFileName, trimQualFileName, scrapQualFileName, groupFile, fastaFileNames, qualFileNames, lines[0], qLines[0]);
727                 
728                 //force parent to wait until all the processes are done
729                 for (int i=0;i<processIDS.size();i++) { 
730                         int temp = processIDS[i];
731                         wait(&temp);
732                 }
733                 
734                 //append files
735                 for(int i=0;i<processIDS.size();i++){
736                         
737                         m->mothurOut("Appending files from process " + toString(processIDS[i])); m->mothurOutEndLine();
738                         
739                         m->appendFiles((trimFASTAFileName + toString(processIDS[i]) + ".temp"), trimFASTAFileName);
740                         remove((trimFASTAFileName + toString(processIDS[i]) + ".temp").c_str());
741                         m->appendFiles((scrapFASTAFileName + toString(processIDS[i]) + ".temp"), scrapFASTAFileName);
742                         remove((scrapFASTAFileName + toString(processIDS[i]) + ".temp").c_str());
743                         
744                         if(qFileName != ""){
745                                 m->appendFiles((trimQualFileName + toString(processIDS[i]) + ".temp"), trimQualFileName);
746                                 remove((trimQualFileName + toString(processIDS[i]) + ".temp").c_str());
747                                 m->appendFiles((scrapQualFileName + toString(processIDS[i]) + ".temp"), scrapQualFileName);
748                                 remove((scrapQualFileName + toString(processIDS[i]) + ".temp").c_str());
749                         }
750                         
751                         m->appendFiles((groupFile + toString(processIDS[i]) + ".temp"), groupFile);
752                         remove((groupFile + toString(processIDS[i]) + ".temp").c_str());
753                         
754                         
755                         if(allFiles){
756                                 for(int j=0;j<fastaFileNames.size();j++){
757                                         for(int k=0;k<fastaFileNames[j].size();k++){
758                                                 if (fastaFileNames[j][k] != "") {
759                                                         m->appendFiles((fastaFileNames[j][k] + toString(processIDS[i]) + ".temp"), fastaFileNames[j][k]);
760                                                         remove((fastaFileNames[j][k] + toString(processIDS[i]) + ".temp").c_str());
761                                                         
762                                                         if(qFileName != ""){
763                                                                 m->appendFiles((qualFileNames[j][k] + toString(processIDS[i]) + ".temp"), qualFileNames[j][k]);
764                                                                 remove((qualFileNames[j][k] + toString(processIDS[i]) + ".temp").c_str());
765                                                         }
766                                                 }
767                                         }
768                                 }
769                         }
770                         
771                         ifstream in;
772                         string tempFile =  filename + toString(processIDS[i]) + ".num.temp";
773                         m->openInputFile(tempFile, in);
774                         int tempNum;
775                         string group;
776                         while (!in.eof()) { 
777                                 in >> group >> tempNum; m->gobble(in);
778                                 
779                                 map<string, int>::iterator it = groupCounts.find(group);
780                                 if (it == groupCounts.end()) {  groupCounts[group] = tempNum; }
781                                 else { groupCounts[it->first] += tempNum; }
782                         }
783                         in.close(); remove(tempFile.c_str());
784                         
785                 }
786         
787                 return exitCommand;
788 #endif          
789         }
790         catch(exception& e) {
791                 m->errorOut(e, "TrimSeqsCommand", "createProcessesCreateTrim");
792                 exit(1);
793         }
794 }
795
796 /**************************************************************************************************/
797
798 int TrimSeqsCommand::setLines(string filename, string qfilename, vector<unsigned long int>& fastaFilePos, vector<unsigned long int>& qfileFilePos) {
799         try {
800                 
801                 //set file positions for fasta file
802                 fastaFilePos = m->divideFile(filename, processors);
803                 
804                 if (qfilename == "") { return processors; }
805                 
806                 //get name of first sequence in each chunk
807                 map<string, int> firstSeqNames;
808                 for (int i = 0; i < (fastaFilePos.size()-1); i++) {
809                         ifstream in;
810                         m->openInputFile(filename, in);
811                         in.seekg(fastaFilePos[i]);
812                 
813                         Sequence temp(in); 
814                         firstSeqNames[temp.getName()] = i;
815                 
816                         in.close();
817                 }
818                                 
819                 //seach for filePos of each first name in the qfile and save in qfileFilePos
820                 ifstream inQual;
821                 m->openInputFile(qfilename, inQual);
822                 
823                 string input;
824                 while(!inQual.eof()){   
825                         input = m->getline(inQual);
826
827                         if (input.length() != 0) {
828                                 if(input[0] == '>'){ //this is a sequence name line
829                                         istringstream nameStream(input);
830                                         
831                                         string sname = "";  nameStream >> sname;
832                                         sname = sname.substr(1);
833                                         
834                                         map<string, int>::iterator it = firstSeqNames.find(sname);
835                                         
836                                         if(it != firstSeqNames.end()) { //this is the start of a new chunk
837                                                 unsigned long int pos = inQual.tellg(); 
838                                                 qfileFilePos.push_back(pos - input.length() - 1);       
839                                                 firstSeqNames.erase(it);
840                                         }
841                                 }
842                         }
843                         
844                         if (firstSeqNames.size() == 0) { break; }
845                 }
846                 inQual.close();
847                 
848                 
849                 if (firstSeqNames.size() != 0) { 
850                         for (map<string, int>::iterator it = firstSeqNames.begin(); it != firstSeqNames.end(); it++) {
851                                 m->mothurOut(it->first + " is in your fasta file and not in your quality file, not using quality file."); m->mothurOutEndLine();
852                         }
853                         qFileName = "";
854                         return processors;
855                 }
856
857                 //get last file position of qfile
858                 FILE * pFile;
859                 unsigned long int size;
860                 
861                 //get num bytes in file
862                 pFile = fopen (qfilename.c_str(),"rb");
863                 if (pFile==NULL) perror ("Error opening file");
864                 else{
865                         fseek (pFile, 0, SEEK_END);
866                         size=ftell (pFile);
867                         fclose (pFile);
868                 }
869                 
870                 qfileFilePos.push_back(size);
871                 
872                 return processors;
873         }
874         catch(exception& e) {
875                 m->errorOut(e, "TrimSeqsCommand", "setLines");
876                 exit(1);
877         }
878 }
879
880 //***************************************************************************************************************
881
882 void TrimSeqsCommand::getOligos(vector<vector<string> >& fastaFileNames, vector<vector<string> >& qualFileNames){
883         try {
884                 ifstream inOligos;
885                 m->openInputFile(oligoFile, inOligos);
886                 
887                 ofstream test;
888                 
889                 string type, oligo, group;
890
891                 int indexPrimer = 0;
892                 int indexBarcode = 0;
893                 
894                 while(!inOligos.eof()){
895
896                         inOligos >> type; m->gobble(inOligos);
897                                         
898                         if(type[0] == '#'){
899                                 while (!inOligos.eof()) {       char c = inOligos.get(); if (c == 10 || c == 13){       break;  }       } // get rest of line if there's any crap there
900                         }
901                         else{
902                                 //make type case insensitive
903                                 for(int i=0;i<type.length();i++){       type[i] = toupper(type[i]);  }
904                                 
905                                 inOligos >> oligo;
906                                 
907                                 for(int i=0;i<oligo.length();i++){
908                                         oligo[i] = toupper(oligo[i]);
909                                         if(oligo[i] == 'U')     {       oligo[i] = 'T'; }
910                                 }
911                                 
912                                 if(type == "FORWARD"){
913                                         group = "";
914                                         
915                                         // get rest of line in case there is a primer name
916                                         while (!inOligos.eof()) {       
917                                                 char c = inOligos.get(); 
918                                                 if (c == 10 || c == 13){        break;  }
919                                                 else if (c == 32 || c == 9){;} //space or tab
920                                                 else {  group += c;  }
921                                         } 
922                                         
923                                         //check for repeat barcodes
924                                         map<string, int>::iterator itPrime = primers.find(oligo);
925                                         if (itPrime != primers.end()) { m->mothurOut("primer " + oligo + " is in your oligos file already."); m->mothurOutEndLine();  }
926                                         
927                                         primers[oligo]=indexPrimer; indexPrimer++;              
928                                         primerNameVector.push_back(group);
929                                 }
930                                 else if(type == "REVERSE"){
931                                         Sequence oligoRC("reverse", oligo);
932                                         oligoRC.reverseComplement();
933                                         revPrimer.push_back(oligoRC.getUnaligned());
934                                 }
935                                 else if(type == "BARCODE"){
936                                         inOligos >> group;
937                                         
938                                         //check for repeat barcodes
939                                         map<string, int>::iterator itBar = barcodes.find(oligo);
940                                         if (itBar != barcodes.end()) { m->mothurOut("barcode " + oligo + " is in your oligos file already."); m->mothurOutEndLine();  }
941                                                 
942                                         barcodes[oligo]=indexBarcode; indexBarcode++;
943                                         barcodeNameVector.push_back(group);
944                                 }
945                                 else{   m->mothurOut(type + " is not recognized as a valid type. Choices are forward, reverse, and barcode. Ignoring " + oligo + "."); m->mothurOutEndLine();  }
946                         }
947                         m->gobble(inOligos);
948                 }       
949                 inOligos.close();
950                 
951                 if(barcodeNameVector.size() == 0 && primerNameVector[0] == ""){ allFiles = 0;   }
952                 
953                 //add in potential combos
954                 if(barcodeNameVector.size() == 0){
955                         barcodes[""] = 0;
956                         barcodeNameVector.push_back("");                        
957                 }
958                 
959                 if(primerNameVector.size() == 0){
960                         primers[""] = 0;
961                         primerNameVector.push_back("");                 
962                 }
963                 
964                 fastaFileNames.resize(barcodeNameVector.size());
965                 for(int i=0;i<fastaFileNames.size();i++){
966                         fastaFileNames[i].assign(primerNameVector.size(), "");
967                 }
968                 if(qFileName != ""){    qualFileNames = fastaFileNames; }
969                 
970                 if(allFiles){
971                         set<string> uniqueNames; //used to cleanup outputFileNames
972                         for(map<string, int>::iterator itBar = barcodes.begin();itBar != barcodes.end();itBar++){
973                                 for(map<string, int>::iterator itPrimer = primers.begin();itPrimer != primers.end(); itPrimer++){
974                                         
975                                         string primerName = primerNameVector[itPrimer->second];
976                                         string barcodeName = barcodeNameVector[itBar->second];
977                                         
978                                         string comboGroupName = "";
979                                         string fastaFileName = "";
980                                         string qualFileName = "";
981                                         
982                                         if(primerName == ""){
983                                                 comboGroupName = barcodeNameVector[itBar->second];
984                                         }
985                                         else{
986                                                 if(barcodeName == ""){
987                                                         comboGroupName = primerNameVector[itPrimer->second];
988                                                 }
989                                                 else{
990                                                         comboGroupName = barcodeNameVector[itBar->second] + "." + primerNameVector[itPrimer->second];
991                                                 }
992                                         }
993
994                                         ofstream temp;
995                                         fastaFileName = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + comboGroupName + ".fasta";
996                                         if (uniqueNames.count(fastaFileName) == 0) {
997                                                 outputNames.push_back(fastaFileName);
998                                                 outputTypes["fasta"].push_back(fastaFileName);
999                                                 uniqueNames.insert(fastaFileName);
1000                                         }
1001                                         
1002                                         fastaFileNames[itBar->second][itPrimer->second] = fastaFileName;
1003                                         m->openOutputFile(fastaFileName, temp);         temp.close();
1004
1005                                         if(qFileName != ""){
1006                                                 qualFileName = outputDir + m->getRootName(m->getSimpleName(qFileName)) + comboGroupName + ".qual";
1007                                                 if (uniqueNames.count(fastaFileName) == 0) {
1008                                                         outputNames.push_back(qualFileName);
1009                                                         outputTypes["qfile"].push_back(qualFileName);
1010                                                 }
1011                                                 
1012                                                 qualFileNames[itBar->second][itPrimer->second] = qualFileName;
1013                                                 m->openOutputFile(qualFileName, temp);          temp.close();
1014                                         }
1015                                 }
1016                         }
1017                 }
1018                 numFPrimers = primers.size();
1019                 numRPrimers = revPrimer.size();
1020
1021         }
1022         catch(exception& e) {
1023                 m->errorOut(e, "TrimSeqsCommand", "getOligos");
1024                 exit(1);
1025         }
1026 }
1027
1028 //***************************************************************************************************************
1029
1030 int TrimSeqsCommand::stripBarcode(Sequence& seq, QualityScores& qual, int& group){
1031         try {
1032                 
1033                 string rawSequence = seq.getUnaligned();
1034                 int success = bdiffs + 1;       //guilty until proven innocent
1035                 
1036                 //can you find the barcode
1037                 for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
1038                         string oligo = it->first;
1039                         if(rawSequence.length() < oligo.length()){      //let's just assume that the barcodes are the same length
1040                                 success = bdiffs + 10;                                  //if the sequence is shorter than the barcode then bail out
1041                                 break;  
1042                         }
1043                         
1044                         if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
1045                                 group = it->second;
1046                                 seq.setUnaligned(rawSequence.substr(oligo.length()));
1047                                 
1048                                 if(qual.getName() != ""){
1049                                         qual.trimQScores(oligo.length(), -1);
1050                                 }
1051                                 
1052                                 success = 0;
1053                                 break;
1054                         }
1055                 }
1056                 
1057                 //if you found the barcode or if you don't want to allow for diffs
1058                 if ((bdiffs == 0) || (success == 0)) { return success;  }
1059                 
1060                 else { //try aligning and see if you can find it
1061
1062                         int maxLength = 0;
1063
1064                         Alignment* alignment;
1065                         if (barcodes.size() > 0) {
1066                                 map<string,int>::iterator it=barcodes.begin();
1067
1068                                 for(it;it!=barcodes.end();it++){
1069                                         if(it->first.length() > maxLength){
1070                                                 maxLength = it->first.length();
1071                                         }
1072                                 }
1073                                 alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+bdiffs+1));  
1074
1075                         }else{ alignment = NULL; } 
1076                         
1077                         //can you find the barcode
1078                         int minDiff = 1e6;
1079                         int minCount = 1;
1080                         int minGroup = -1;
1081                         int minPos = 0;
1082                         
1083                         for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
1084                                 string oligo = it->first;
1085 //                              int length = oligo.length();
1086                                 
1087                                 if(rawSequence.length() < maxLength){   //let's just assume that the barcodes are the same length
1088                                         success = bdiffs + 10;
1089                                         break;
1090                                 }
1091                                 
1092                                 //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
1093                                 alignment->align(oligo, rawSequence.substr(0,oligo.length()+bdiffs));
1094                                 oligo = alignment->getSeqAAln();
1095                                 string temp = alignment->getSeqBAln();
1096                 
1097                                 int alnLength = oligo.length();
1098                                 
1099                                 for(int i=oligo.length()-1;i>=0;i--){
1100                                         if(oligo[i] != '-'){    alnLength = i+1;        break;  }
1101                                 }
1102                                 oligo = oligo.substr(0,alnLength);
1103                                 temp = temp.substr(0,alnLength);
1104                                 
1105                                 int numDiff = countDiffs(oligo, temp);
1106                                 
1107                                 if(numDiff < minDiff){
1108                                         minDiff = numDiff;
1109                                         minCount = 1;
1110                                         minGroup = it->second;
1111                                         minPos = 0;
1112                                         for(int i=0;i<alnLength;i++){
1113                                                 if(temp[i] != '-'){
1114                                                         minPos++;
1115                                                 }
1116                                         }
1117                                 }
1118                                 else if(numDiff == minDiff){
1119                                         minCount++;
1120                                 }
1121
1122                         }
1123
1124                         if(minDiff > bdiffs)    {       success = minDiff;              }       //no good matches
1125                         else if(minCount > 1)   {       success = bdiffs + 100; }       //can't tell the difference between multiple barcodes
1126                         else{                                                                                                   //use the best match
1127                                 group = minGroup;
1128                                 seq.setUnaligned(rawSequence.substr(minPos));
1129                                 
1130                                 if(qual.getName() != ""){
1131                                         qual.trimQScores(minPos, -1);
1132                                 }
1133                                 success = minDiff;
1134                         }
1135                         
1136                         if (alignment != NULL) {  delete alignment;  }
1137                         
1138                 }
1139                 
1140                 return success;
1141                 
1142         }
1143         catch(exception& e) {
1144                 m->errorOut(e, "TrimSeqsCommand", "stripBarcode");
1145                 exit(1);
1146         }
1147
1148 }
1149
1150 //***************************************************************************************************************
1151
1152 int TrimSeqsCommand::stripForward(Sequence& seq, QualityScores& qual, int& group){
1153         try {
1154                 string rawSequence = seq.getUnaligned();
1155                 int success = pdiffs + 1;       //guilty until proven innocent
1156                 
1157                 //can you find the primer
1158                 for(map<string,int>::iterator it=primers.begin();it!=primers.end();it++){
1159                         string oligo = it->first;
1160                         if(rawSequence.length() < oligo.length()){      //let's just assume that the primers are the same length
1161                                 success = pdiffs + 10;                                  //if the sequence is shorter than the barcode then bail out
1162                                 break;  
1163                         }
1164                         
1165                         if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
1166                                 group = it->second;
1167                                 seq.setUnaligned(rawSequence.substr(oligo.length()));
1168                                 if(qual.getName() != ""){
1169                                         qual.trimQScores(oligo.length(), -1);
1170                                 }
1171                                 success = 0;
1172                                 break;
1173                         }
1174                 }
1175
1176                 //if you found the barcode or if you don't want to allow for diffs
1177                 if ((pdiffs == 0) || (success == 0)) { return success;  }
1178                 
1179                 else { //try aligning and see if you can find it
1180
1181                         int maxLength = 0;
1182
1183                         Alignment* alignment;
1184                         if (primers.size() > 0) {
1185                                 map<string,int>::iterator it=primers.begin();
1186
1187                                 for(it;it!=primers.end();it++){
1188                                         if(it->first.length() > maxLength){
1189                                                 maxLength = it->first.length();
1190                                         }
1191                                 }
1192                                 alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+pdiffs+1));  
1193
1194                         }else{ alignment = NULL; } 
1195                         
1196                         //can you find the barcode
1197                         int minDiff = 1e6;
1198                         int minCount = 1;
1199                         int minGroup = -1;
1200                         int minPos = 0;
1201                         
1202                         for(map<string,int>::iterator it=primers.begin();it!=primers.end();it++){
1203                                 string oligo = it->first;
1204 //                              int length = oligo.length();
1205                                 
1206                                 if(rawSequence.length() < maxLength){   
1207                                         success = pdiffs + 100;
1208                                         break;
1209                                 }
1210                                 
1211                                 //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
1212                                 alignment->align(oligo, rawSequence.substr(0,oligo.length()+pdiffs));
1213                                 oligo = alignment->getSeqAAln();
1214                                 string temp = alignment->getSeqBAln();
1215                 
1216                                 int alnLength = oligo.length();
1217                                 
1218                                 for(int i=oligo.length()-1;i>=0;i--){
1219                                         if(oligo[i] != '-'){    alnLength = i+1;        break;  }
1220                                 }
1221                                 oligo = oligo.substr(0,alnLength);
1222                                 temp = temp.substr(0,alnLength);
1223                                 
1224                                 int numDiff = countDiffs(oligo, temp);
1225                                 
1226                                 if(numDiff < minDiff){
1227                                         minDiff = numDiff;
1228                                         minCount = 1;
1229                                         minGroup = it->second;
1230                                         minPos = 0;
1231                                         for(int i=0;i<alnLength;i++){
1232                                                 if(temp[i] != '-'){
1233                                                         minPos++;
1234                                                 }
1235                                         }
1236                                 }
1237                                 else if(numDiff == minDiff){
1238                                         minCount++;
1239                                 }
1240
1241                         }
1242
1243                         if(minDiff > pdiffs)    {       success = minDiff;              }       //no good matches
1244                         else if(minCount > 1)   {       success = pdiffs + 10;  }       //can't tell the difference between multiple primers
1245                         else{                                                                                                   //use the best match
1246                                 group = minGroup;
1247                                 seq.setUnaligned(rawSequence.substr(minPos));
1248                                 if(qual.getName() != ""){
1249                                         qual.trimQScores(minPos, -1);
1250                                 }
1251                                 success = minDiff;
1252                         }
1253                         
1254                         if (alignment != NULL) {  delete alignment;  }
1255                         
1256                 }
1257                 
1258                 return success;
1259
1260         }
1261         catch(exception& e) {
1262                 m->errorOut(e, "TrimSeqsCommand", "stripForward");
1263                 exit(1);
1264         }
1265 }
1266
1267 //***************************************************************************************************************
1268
1269 bool TrimSeqsCommand::stripReverse(Sequence& seq, QualityScores& qual){
1270         try {
1271                 string rawSequence = seq.getUnaligned();
1272                 bool success = 0;       //guilty until proven innocent
1273                 
1274                 for(int i=0;i<numRPrimers;i++){
1275                         string oligo = revPrimer[i];
1276                         
1277                         if(rawSequence.length() < oligo.length()){
1278                                 success = 0;
1279                                 break;
1280                         }
1281                         
1282                         if(compareDNASeq(oligo, rawSequence.substr(rawSequence.length()-oligo.length(),oligo.length()))){
1283                                 seq.setUnaligned(rawSequence.substr(0,rawSequence.length()-oligo.length()));
1284                                 if(qual.getName() != ""){
1285                                         qual.trimQScores(-1, rawSequence.length()-oligo.length());
1286                                 }
1287                                 success = 1;
1288                                 break;
1289                         }
1290                 }       
1291                 return success;
1292                 
1293         }
1294         catch(exception& e) {
1295                 m->errorOut(e, "TrimSeqsCommand", "stripReverse");
1296                 exit(1);
1297         }
1298 }
1299
1300 //***************************************************************************************************************
1301
1302 bool TrimSeqsCommand::keepFirstTrim(Sequence& sequence, QualityScores& qscores){
1303         try {
1304                 bool success = 1;
1305                 if(qscores.getName() != ""){
1306                         qscores.trimQScores(-1, keepFirst);
1307                 }
1308                 sequence.trim(keepFirst);
1309                 return success;
1310         }
1311         catch(exception& e) {
1312                 m->errorOut(e, "keepFirstTrim", "countDiffs");
1313                 exit(1);
1314         }
1315         
1316 }       
1317
1318 //***************************************************************************************************************
1319
1320 bool TrimSeqsCommand::removeLastTrim(Sequence& sequence, QualityScores& qscores){
1321         try {
1322                 bool success = 0;
1323                 
1324                 int length = sequence.getNumBases() - removeLast;
1325                 
1326                 if(length > 0){
1327                         if(qscores.getName() != ""){
1328                                 qscores.trimQScores(-1, length);
1329                         }
1330                         sequence.trim(length);
1331                         success = 1;
1332                 }
1333                 else{
1334                         success = 0;
1335                 }
1336
1337                 return success;
1338         }
1339         catch(exception& e) {
1340                 m->errorOut(e, "removeLastTrim", "countDiffs");
1341                 exit(1);
1342         }
1343         
1344 }       
1345
1346 //***************************************************************************************************************
1347
1348 bool TrimSeqsCommand::cullLength(Sequence& seq){
1349         try {
1350         
1351                 int length = seq.getNumBases();
1352                 bool success = 0;       //guilty until proven innocent
1353                 
1354                 if(length >= minLength && maxLength == 0)                       {       success = 1;    }
1355                 else if(length >= minLength && length <= maxLength)     {       success = 1;    }
1356                 else                                                                                            {       success = 0;    }
1357                 
1358                 return success;
1359         
1360         }
1361         catch(exception& e) {
1362                 m->errorOut(e, "TrimSeqsCommand", "cullLength");
1363                 exit(1);
1364         }
1365         
1366 }
1367
1368 //***************************************************************************************************************
1369
1370 bool TrimSeqsCommand::cullHomoP(Sequence& seq){
1371         try {
1372                 int longHomoP = seq.getLongHomoPolymer();
1373                 bool success = 0;       //guilty until proven innocent
1374                 
1375                 if(longHomoP <= maxHomoP){      success = 1;    }
1376                 else                                    {       success = 0;    }
1377                 
1378                 return success;
1379         }
1380         catch(exception& e) {
1381                 m->errorOut(e, "TrimSeqsCommand", "cullHomoP");
1382                 exit(1);
1383         }
1384         
1385 }
1386
1387 //***************************************************************************************************************
1388
1389 bool TrimSeqsCommand::cullAmbigs(Sequence& seq){
1390         try {
1391                 int numNs = seq.getAmbigBases();
1392                 bool success = 0;       //guilty until proven innocent
1393                 
1394                 if(numNs <= maxAmbig)   {       success = 1;    }
1395                 else                                    {       success = 0;    }
1396                 
1397                 return success;
1398         }
1399         catch(exception& e) {
1400                 m->errorOut(e, "TrimSeqsCommand", "cullAmbigs");
1401                 exit(1);
1402         }
1403         
1404 }
1405
1406 //***************************************************************************************************************
1407
1408 bool TrimSeqsCommand::compareDNASeq(string oligo, string seq){
1409         try {
1410                 bool success = 1;
1411                 int length = oligo.length();
1412                 
1413                 for(int i=0;i<length;i++){
1414                         
1415                         if(oligo[i] != seq[i]){
1416                                 if(oligo[i] == 'A' || oligo[i] == 'T' || oligo[i] == 'G' || oligo[i] == 'C')    {       success = 0;    }
1417                                 else if((oligo[i] == 'N' || oligo[i] == 'I') && (seq[i] == 'N'))                                {       success = 0;    }
1418                                 else if(oligo[i] == 'R' && (seq[i] != 'A' && seq[i] != 'G'))                                    {       success = 0;    }
1419                                 else if(oligo[i] == 'Y' && (seq[i] != 'C' && seq[i] != 'T'))                                    {       success = 0;    }
1420                                 else if(oligo[i] == 'M' && (seq[i] != 'C' && seq[i] != 'A'))                                    {       success = 0;    }
1421                                 else if(oligo[i] == 'K' && (seq[i] != 'T' && seq[i] != 'G'))                                    {       success = 0;    }
1422                                 else if(oligo[i] == 'W' && (seq[i] != 'T' && seq[i] != 'A'))                                    {       success = 0;    }
1423                                 else if(oligo[i] == 'S' && (seq[i] != 'C' && seq[i] != 'G'))                                    {       success = 0;    }
1424                                 else if(oligo[i] == 'B' && (seq[i] != 'C' && seq[i] != 'T' && seq[i] != 'G'))   {       success = 0;    }
1425                                 else if(oligo[i] == 'D' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'G'))   {       success = 0;    }
1426                                 else if(oligo[i] == 'H' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'C'))   {       success = 0;    }
1427                                 else if(oligo[i] == 'V' && (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G'))   {       success = 0;    }                       
1428                                 
1429                                 if(success == 0)        {       break;   }
1430                         }
1431                         else{
1432                                 success = 1;
1433                         }
1434                 }
1435                 
1436                 return success;
1437         }
1438         catch(exception& e) {
1439                 m->errorOut(e, "TrimSeqsCommand", "compareDNASeq");
1440                 exit(1);
1441         }
1442
1443 }
1444
1445 //***************************************************************************************************************
1446
1447 int TrimSeqsCommand::countDiffs(string oligo, string seq){
1448         try {
1449
1450                 int length = oligo.length();
1451                 int countDiffs = 0;
1452                 
1453                 for(int i=0;i<length;i++){
1454                                                                 
1455                         if(oligo[i] != seq[i]){
1456                                 if(oligo[i] == 'A' || oligo[i] == 'T' || oligo[i] == 'G' || oligo[i] == 'C' || oligo[i] == '-' || oligo[i] == '.')      {       countDiffs++;   }
1457                                 else if((oligo[i] == 'N' || oligo[i] == 'I') && (seq[i] == 'N'))                                {       countDiffs++;   }
1458                                 else if(oligo[i] == 'R' && (seq[i] != 'A' && seq[i] != 'G'))                                    {       countDiffs++;   }
1459                                 else if(oligo[i] == 'Y' && (seq[i] != 'C' && seq[i] != 'T'))                                    {       countDiffs++;   }
1460                                 else if(oligo[i] == 'M' && (seq[i] != 'C' && seq[i] != 'A'))                                    {       countDiffs++;   }
1461                                 else if(oligo[i] == 'K' && (seq[i] != 'T' && seq[i] != 'G'))                                    {       countDiffs++;   }
1462                                 else if(oligo[i] == 'W' && (seq[i] != 'T' && seq[i] != 'A'))                                    {       countDiffs++;   }
1463                                 else if(oligo[i] == 'S' && (seq[i] != 'C' && seq[i] != 'G'))                                    {       countDiffs++;   }
1464                                 else if(oligo[i] == 'B' && (seq[i] != 'C' && seq[i] != 'T' && seq[i] != 'G'))   {       countDiffs++;   }
1465                                 else if(oligo[i] == 'D' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'G'))   {       countDiffs++;   }
1466                                 else if(oligo[i] == 'H' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'C'))   {       countDiffs++;   }
1467                                 else if(oligo[i] == 'V' && (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G'))   {       countDiffs++;   }       
1468                         }
1469                         
1470                 }
1471                 
1472                 return countDiffs;
1473         }
1474         catch(exception& e) {
1475                 m->errorOut(e, "TrimSeqsCommand", "countDiffs");
1476                 exit(1);
1477         }
1478
1479 }
1480
1481 //***************************************************************************************************************