]> git.donarmstrong.com Git - mothur.git/blob - trimseqscommand.cpp
cluster.split fix
[mothur.git] / trimseqscommand.cpp
1 /*
2  *  trimseqscommand.cpp
3  *  Mothur
4  *
5  *  Created by Pat Schloss on 6/6/09.
6  *  Copyright 2009 Patrick D. Schloss. All rights reserved.
7  *
8  */
9
10 #include "trimseqscommand.h"
11 #include "needlemanoverlap.hpp"
12
13 //**********************************************************************************************************************
14 vector<string> TrimSeqsCommand::getValidParameters(){   
15         try {
16                 string Array[] =  {"fasta", "flip", "oligos", "maxambig", "maxhomop", "minlength", "maxlength", "qfile", 
17                                                                         "qthreshold", "qwindowaverage", "qstepsize", "qwindowsize", "qaverage", "rollaverage", "allfiles", "qtrim","tdiffs", "pdiffs", "bdiffs", "processors", "outputdir","inputdir"};
18                 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
19                 return myArray;
20         }
21         catch(exception& e) {
22                 m->errorOut(e, "TrimSeqsCommand", "getValidParameters");
23                 exit(1);
24         }
25 }
26 //**********************************************************************************************************************
27 TrimSeqsCommand::TrimSeqsCommand(){     
28         try {
29                 abort = true;
30                 //initialize outputTypes
31                 vector<string> tempOutNames;
32                 outputTypes["fasta"] = tempOutNames;
33                 outputTypes["qual"] = tempOutNames;
34                 outputTypes["group"] = tempOutNames;
35         }
36         catch(exception& e) {
37                 m->errorOut(e, "TrimSeqsCommand", "TrimSeqsCommand");
38                 exit(1);
39         }
40 }
41 //**********************************************************************************************************************
42 vector<string> TrimSeqsCommand::getRequiredParameters(){        
43         try {
44                 string Array[] =  {"fasta"};
45                 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
46                 return myArray;
47         }
48         catch(exception& e) {
49                 m->errorOut(e, "TrimSeqsCommand", "getRequiredParameters");
50                 exit(1);
51         }
52 }
53 //**********************************************************************************************************************
54 vector<string> TrimSeqsCommand::getRequiredFiles(){     
55         try {
56                 vector<string> myArray;
57                 return myArray;
58         }
59         catch(exception& e) {
60                 m->errorOut(e, "TrimSeqsCommand", "getRequiredFiles");
61                 exit(1);
62         }
63 }
64 //***************************************************************************************************************
65
66 TrimSeqsCommand::TrimSeqsCommand(string option)  {
67         try {
68                 
69                 abort = false;
70                 comboStarts = 0;
71                 
72                 //allow user to run help
73                 if(option == "help") { help(); abort = true; }
74                 
75                 else {
76                         //valid paramters for this command
77                         string AlignArray[] =  {"fasta", "flip", "oligos", "maxambig", "maxhomop", "minlength", "maxlength", "qfile", 
78                                                                         "qthreshold", "qwindowaverage", "qstepsize", "qwindowsize", "qaverage", "rollaverage", "allfiles", "qtrim","tdiffs", "pdiffs", "bdiffs", "processors", "outputdir","inputdir"};
79                         
80                         vector<string> myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string)));
81                         
82                         OptionParser parser(option);
83                         map<string,string> parameters = parser.getParameters();
84                         
85                         ValidParameters validParameter;
86                         map<string,string>::iterator it;
87                         
88                         //check to make sure all parameters are valid for command
89                         for (it = parameters.begin(); it != parameters.end(); it++) { 
90                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
91                         }
92                         
93                         //initialize outputTypes
94                         vector<string> tempOutNames;
95                         outputTypes["fasta"] = tempOutNames;
96                         outputTypes["qual"] = tempOutNames;
97                         outputTypes["group"] = tempOutNames;
98                         
99                         //if the user changes the input directory command factory will send this info to us in the output parameter 
100                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
101                         if (inputDir == "not found"){   inputDir = "";          }
102                         else {
103                                 string path;
104                                 it = parameters.find("fasta");
105                                 //user has given a template file
106                                 if(it != parameters.end()){ 
107                                         path = m->hasPath(it->second);
108                                         //if the user has not given a path then, add inputdir. else leave path alone.
109                                         if (path == "") {       parameters["fasta"] = inputDir + it->second;            }
110                                 }
111                                 
112                                 it = parameters.find("oligos");
113                                 //user has given a template file
114                                 if(it != parameters.end()){ 
115                                         path = m->hasPath(it->second);
116                                         //if the user has not given a path then, add inputdir. else leave path alone.
117                                         if (path == "") {       parameters["oligos"] = inputDir + it->second;           }
118                                 }
119                                 
120                                 it = parameters.find("qfile");
121                                 //user has given a template file
122                                 if(it != parameters.end()){ 
123                                         path = m->hasPath(it->second);
124                                         //if the user has not given a path then, add inputdir. else leave path alone.
125                                         if (path == "") {       parameters["qfile"] = inputDir + it->second;            }
126                                 }
127                         }
128
129                         
130                         //check for required parameters
131                         fastaFile = validParameter.validFile(parameters, "fasta", true);
132                         if (fastaFile == "not found") { m->mothurOut("fasta is a required parameter for the trim.seqs command."); m->mothurOutEndLine(); abort = true; }
133                         else if (fastaFile == "not open") { abort = true; }     
134                         
135                         //if the user changes the output directory command factory will send this info to us in the output parameter 
136                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  
137                                 outputDir = ""; 
138                                 outputDir += m->hasPath(fastaFile); //if user entered a file with a path then preserve it       
139                         }
140                 
141                         
142                         //check for optional parameter and set defaults
143                         // ...at some point should added some additional type checking...
144                         string temp;
145                         temp = validParameter.validFile(parameters, "flip", false);
146                         if (temp == "not found"){       flip = 0;       }
147                         else if(m->isTrue(temp))        {       flip = 1;       }
148                 
149                         temp = validParameter.validFile(parameters, "oligos", true);
150                         if (temp == "not found"){       oligoFile = "";         }
151                         else if(temp == "not open"){    abort = true;   } 
152                         else                                    {       oligoFile = temp;               }
153                         
154                         temp = validParameter.validFile(parameters, "maxambig", false);         if (temp == "not found") { temp = "-1"; }
155                         convert(temp, maxAmbig);  
156
157                         temp = validParameter.validFile(parameters, "maxhomop", false);         if (temp == "not found") { temp = "0"; }
158                         convert(temp, maxHomoP);  
159
160                         temp = validParameter.validFile(parameters, "minlength", false);        if (temp == "not found") { temp = "0"; }
161                         convert(temp, minLength); 
162                         
163                         temp = validParameter.validFile(parameters, "maxlength", false);        if (temp == "not found") { temp = "0"; }
164                         convert(temp, maxLength);
165                         
166                         temp = validParameter.validFile(parameters, "bdiffs", false);           if (temp == "not found") { temp = "0"; }
167                         convert(temp, bdiffs);
168                         
169                         temp = validParameter.validFile(parameters, "pdiffs", false);           if (temp == "not found") { temp = "0"; }
170                         convert(temp, pdiffs);
171                         
172                         temp = validParameter.validFile(parameters, "tdiffs", false);           if (temp == "not found") { int tempTotal = pdiffs + bdiffs;  temp = toString(tempTotal); }
173                         convert(temp, tdiffs);
174                         
175                         if(tdiffs == 0){        tdiffs = bdiffs + pdiffs;       }
176                         
177                         temp = validParameter.validFile(parameters, "qfile", true);     
178                         if (temp == "not found")        {       qFileName = "";         }
179                         else if(temp == "not open")     {       abort = true;           }
180                         else                                            {       qFileName = temp;       }
181                         
182                         temp = validParameter.validFile(parameters, "qthreshold", false);       if (temp == "not found") { temp = "0"; }
183                         convert(temp, qThreshold);
184                         
185                         temp = validParameter.validFile(parameters, "qtrim", false);            if (temp == "not found") { temp = "F"; }
186                         qtrim = m->isTrue(temp);
187
188                         temp = validParameter.validFile(parameters, "rollaverage", false);      if (temp == "not found") { temp = "0"; }
189                         convert(temp, qRollAverage);
190
191                         temp = validParameter.validFile(parameters, "qwindowaverage", false);if (temp == "not found") { temp = "0"; }
192                         convert(temp, qWindowAverage);
193
194                         temp = validParameter.validFile(parameters, "qwindowsize", false);      if (temp == "not found") { temp = "100"; }
195                         convert(temp, qWindowSize);
196
197                         temp = validParameter.validFile(parameters, "qstepsize", false);                if (temp == "not found") { temp = "10"; }
198                         convert(temp, qWindowStep);
199
200                         temp = validParameter.validFile(parameters, "qaverage", false);         if (temp == "not found") { temp = "0"; }
201                         convert(temp, qAverage);
202                         
203                         temp = validParameter.validFile(parameters, "allfiles", false);         if (temp == "not found") { temp = "F"; }
204                         allFiles = m->isTrue(temp);
205                         
206                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found") { temp = "1"; }
207                         convert(temp, processors); 
208                         
209                         if(allFiles && oligoFile == ""){
210                                 m->mothurOut("You selected allfiles, but didn't enter an oligos file.  Ignoring the allfiles request."); m->mothurOutEndLine();
211                         }
212                         if((qAverage != 0 && qThreshold != 0) && qFileName == ""){
213                                 m->mothurOut("You didn't provide a quality file name, quality criteria will be ignored."); m->mothurOutEndLine();
214                                 qAverage=0;
215                                 qThreshold=0;
216                         }
217                         if(!flip && oligoFile=="" && !maxLength && !minLength && (maxAmbig==-1) && !maxHomoP && qFileName == ""){               
218                                 m->mothurOut("You didn't set any options... quiting command."); m->mothurOutEndLine();
219                                 abort = true;
220                         }
221                 }
222
223         }
224         catch(exception& e) {
225                 m->errorOut(e, "TrimSeqsCommand", "TrimSeqsCommand");
226                 exit(1);
227         }
228 }
229 //**********************************************************************************************************************
230
231 void TrimSeqsCommand::help(){
232         try {
233                 m->mothurOut("The trim.seqs command reads a fastaFile and creates .....\n");
234                 m->mothurOut("The trim.seqs command parameters are fasta, flip, oligos, maxambig, maxhomop, minlength, maxlength, qfile, qthreshold, qaverage, diffs, qtrim and allfiles.\n");
235                 m->mothurOut("The fasta parameter is required.\n");
236                 m->mothurOut("The flip parameter will output the reverse compliment of your trimmed sequence. The default is false.\n");
237                 m->mothurOut("The oligos parameter .... The default is "".\n");
238                 m->mothurOut("The maxambig parameter .... The default is -1.\n");
239                 m->mothurOut("The maxhomop parameter .... The default is 0.\n");
240                 m->mothurOut("The minlength parameter .... The default is 0.\n");
241                 m->mothurOut("The maxlength parameter .... The default is 0.\n");
242                 m->mothurOut("The tdiffs parameter is used to specify the total number of differences allowed in the sequence. The default is pdiffs + bdiffs.\n");
243                 m->mothurOut("The bdiffs parameter is used to specify the number of differences allowed in the barcode. The default is 0.\n");
244                 m->mothurOut("The pdiffs parameter is used to specify the number of differences allowed in the primer. The default is 0.\n");
245                 m->mothurOut("The qfile parameter .....\n");
246                 m->mothurOut("The qthreshold parameter .... The default is 0.\n");
247                 m->mothurOut("The qaverage parameter .... The default is 0.\n");
248                 m->mothurOut("The allfiles parameter .... The default is F.\n");
249                 m->mothurOut("The qtrim parameter .... The default is F.\n");
250                 m->mothurOut("The trim.seqs command should be in the following format: \n");
251                 m->mothurOut("trim.seqs(fasta=yourFastaFile, flip=yourFlip, oligos=yourOligos, maxambig=yourMaxambig,  \n");
252                 m->mothurOut("maxhomop=yourMaxhomop, minlength=youMinlength, maxlength=yourMaxlength)  \n");    
253                 m->mothurOut("Example trim.seqs(fasta=abrecovery.fasta, flip=..., oligos=..., maxambig=..., maxhomop=..., minlength=..., maxlength=...).\n");
254                 m->mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n");
255                 m->mothurOut("For more details please check out the wiki http://www.mothur.org/wiki/Trim.seqs .\n\n");
256
257         }
258         catch(exception& e) {
259                 m->errorOut(e, "TrimSeqsCommand", "help");
260                 exit(1);
261         }
262 }
263
264
265 //***************************************************************************************************************
266
267 TrimSeqsCommand::~TrimSeqsCommand(){    /*      do nothing      */      }
268
269 //***************************************************************************************************************
270
271 int TrimSeqsCommand::execute(){
272         try{
273         
274                 if (abort == true) { return 0; }
275                 
276                 numFPrimers = 0;  //this needs to be initialized
277                 numRPrimers = 0;
278                 
279                 string trimSeqFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "trim.fasta";
280                 outputNames.push_back(trimSeqFile); outputTypes["fasta"].push_back(trimSeqFile);
281                 string scrapSeqFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "scrap.fasta";
282                 outputNames.push_back(scrapSeqFile); outputTypes["fasta"].push_back(scrapSeqFile);
283                 string trimQualFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "trim.qual";
284                 string scrapQualFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "scrap.qual";
285                 if (qFileName != "") {  outputNames.push_back(trimQualFile); outputNames.push_back(scrapQualFile);  outputTypes["qual"].push_back(trimQualFile); outputTypes["qual"].push_back(scrapQualFile); }
286                 string groupFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "groups";
287                 
288                 vector<string> fastaFileNames;
289                 vector<string> qualFileNames;
290                 if(oligoFile != ""){
291                         outputNames.push_back(groupFile); outputTypes["group"].push_back(groupFile);
292                         getOligos(fastaFileNames, qualFileNames);
293                 }
294
295                 vector<unsigned long int> fastaFilePos;
296                 vector<unsigned long int> qFilePos;
297                 
298                 setLines(fastaFile, qFileName, fastaFilePos, qFilePos);
299                 
300                 for (int i = 0; i < (fastaFilePos.size()-1); i++) {
301                         lines.push_back(new linePair(fastaFilePos[i], fastaFilePos[(i+1)]));
302                         if (qFileName != "") {  qLines.push_back(new linePair(qFilePos[i], qFilePos[(i+1)]));  }
303                 }       
304                 if(qFileName == "")     {       qLines = lines; } //files with duds
305                 
306                 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
307                                 if(processors == 1){
308                                                                                 
309                                         driverCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, groupFile, fastaFileNames, qualFileNames, lines[0], qLines[0]);
310                                         
311                                         for (int j = 0; j < fastaFileNames.size(); j++) {
312                                                 rename((fastaFileNames[j] + toString(getpid()) + ".temp").c_str(), fastaFileNames[j].c_str());
313                                         }
314                                         if(qFileName != ""){
315                                                 for (int j = 0; j < qualFileNames.size(); j++) {
316                                                         rename((qualFileNames[j] + toString(getpid()) + ".temp").c_str(), qualFileNames[j].c_str());
317                                                 }
318                                         }                                               
319
320                                 }else{
321                                         createProcessesCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, groupFile, fastaFileNames, qualFileNames); 
322                                         
323                                         rename((trimSeqFile + toString(processIDS[0]) + ".temp").c_str(), trimSeqFile.c_str());
324                                         rename((scrapSeqFile + toString(processIDS[0]) + ".temp").c_str(), scrapSeqFile.c_str());
325                                         rename((groupFile + toString(processIDS[0]) + ".temp").c_str(), groupFile.c_str());
326                                         
327                                         if(qFileName != ""){
328                                                 rename((trimQualFile + toString(processIDS[0]) + ".temp").c_str(), trimQualFile.c_str());
329                                                 rename((scrapQualFile + toString(processIDS[0]) + ".temp").c_str(), scrapQualFile.c_str());
330                                         }
331                                         
332                                         
333                                         for (int j = 0; j < fastaFileNames.size(); j++) {
334                                                 rename((fastaFileNames[j] + toString(processIDS[0]) + ".temp").c_str(), fastaFileNames[j].c_str());
335                                         }
336                                         if(qFileName != ""){
337                                                 for (int j = 0; j < qualFileNames.size(); j++) {
338                                                         rename((qualFileNames[j] + toString(getpid()) + ".temp").c_str(), qualFileNames[j].c_str());
339                                                 }
340                                         }                                               
341                                         
342                                         //append files
343                                         for(int i=1;i<processors;i++){
344                                                 m->appendFiles((trimSeqFile + toString(processIDS[i]) + ".temp"), trimSeqFile);
345                                                 remove((trimSeqFile + toString(processIDS[i]) + ".temp").c_str());
346                                                 m->appendFiles((scrapSeqFile + toString(processIDS[i]) + ".temp"), scrapSeqFile);
347                                                 remove((scrapSeqFile + toString(processIDS[i]) + ".temp").c_str());
348
349                                                 m->appendFiles((trimQualFile + toString(processIDS[i]) + ".temp"), trimQualFile);
350                                                 remove((trimQualFile + toString(processIDS[i]) + ".temp").c_str());
351                                                 m->appendFiles((scrapQualFile + toString(processIDS[i]) + ".temp"), scrapQualFile);
352                                                 remove((scrapQualFile + toString(processIDS[i]) + ".temp").c_str());
353                                                 
354                                                 m->appendFiles((groupFile + toString(processIDS[i]) + ".temp"), groupFile);
355                                                 remove((groupFile + toString(processIDS[i]) + ".temp").c_str());
356                                                 for (int j = 0; j < fastaFileNames.size(); j++) {
357                                                         m->appendFiles((fastaFileNames[j] + toString(processIDS[i]) + ".temp"), fastaFileNames[j]);
358                                                         remove((fastaFileNames[j] + toString(processIDS[i]) + ".temp").c_str());
359                                                 }
360                                                 
361                                                 if(qFileName != ""){
362                                                         for (int j = 0; j < qualFileNames.size(); j++) {
363                                                                 m->appendFiles((qualFileNames[j] + toString(processIDS[i]) + ".temp"), qualFileNames[j]);
364                                                                 remove((qualFileNames[j] + toString(processIDS[i]) + ".temp").c_str());
365                                                         }
366                                                 }                                               
367                                                 
368                                                 
369                                         }
370                                 }
371                                 
372                                 if (m->control_pressed) {  return 0; }
373                 #else
374                                 driverCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, groupFile, fastaFileNames, qualFileNames, lines[0], qLines[0]);
375                                 
376                                 for (int j = 0; j < fastaFileNames.size(); j++) {
377                                         rename((fastaFileNames[j] + toString(j) + ".temp").c_str(), fastaFileNames[j].c_str());
378                                 }
379                                 if(qFileName != ""){
380                                         for (int j = 0; j < qualFileNames.size(); j++) {
381                                                 rename((qualFileNames[j] + toString(j) + ".temp").c_str(), qualFileNames[j].c_str());
382                                         }
383                                 }
384                                                                         
385                                 if (m->control_pressed) {  return 0; }
386                 #endif
387                                                 
388                                                                                 
389                 for(int i=0;i<fastaFileNames.size();i++){
390                         if (m->isBlank(fastaFileNames[i])) { remove(fastaFileNames[i].c_str()); }
391                         else if (filesToRemove.count(fastaFileNames[i]) > 0) { remove(fastaFileNames[i].c_str()); }
392                         else {
393                                 ifstream inFASTA;
394                                 string seqName;
395                                 m->openInputFile(fastaFileNames[i], inFASTA);
396                                 ofstream outGroups;
397                                 string outGroupFilename = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[i])) + "groups";
398                                 m->openOutputFile(outGroupFilename, outGroups);
399                                 outputNames.push_back(outGroupFilename); outputTypes["group"].push_back(outGroupFilename);  
400                                 
401                                 string thisGroup = "";
402                                 if (i > comboStarts) {
403                                         map<string, int>::iterator itCombo;
404                                         for(itCombo=combos.begin();itCombo!=combos.end(); itCombo++){
405                                                 if(itCombo->second == i){       thisGroup = itCombo->first;     combos.erase(itCombo);  break;  }
406                                         }
407                                 }else{ thisGroup = groupVector[i]; }
408                                 
409                                 while(!inFASTA.eof()){
410                                         if(inFASTA.get() == '>'){
411                                                 inFASTA >> seqName;
412                                                 outGroups << seqName << '\t' << thisGroup << endl;
413                                         }
414                                         while (!inFASTA.eof())  {       char c = inFASTA.get(); if (c == 10 || c == 13){        break;  }       }
415                                 }
416                                 outGroups.close();
417                                 inFASTA.close();
418                         }
419                 }
420                 
421                 if(qFileName != ""){
422                         for(int i=0;i<qualFileNames.size();i++){
423                                 if (m->isBlank(qualFileNames[i])) { remove(qualFileNames[i].c_str());   }
424                                 else if (filesToRemove.count(qualFileNames[i]) > 0) { remove(qualFileNames[i].c_str()); }
425                                 else {
426                                         ifstream inQual;
427                                         string seqName;
428                                         m->openInputFile(qualFileNames[i], inQual);
429 //                                      ofstream outGroups;
430 //                                      
431 //                                      string thisGroup = "";
432 //                                      if (i > comboStarts) {
433 //                                              map<string, int>::iterator itCombo;
434 //                                              for(itCombo=combos.begin();itCombo!=combos.end(); itCombo++){
435 //                                                      if(itCombo->second == i){       thisGroup = itCombo->first;     combos.erase(itCombo);  break;  }
436 //                                              }
437 //                                      }
438 //                                      else{ thisGroup = groupVector[i]; }
439                                         
440                                         inQual.close();
441                                 }
442                         }
443                 }
444                 
445                 
446                 if (m->control_pressed) { 
447                         for (int i = 0; i < outputNames.size(); i++) {  remove(outputNames[i].c_str()); }
448                         return 0;
449                 }
450
451                 m->mothurOutEndLine();
452                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
453                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
454                 m->mothurOutEndLine();
455                 
456                 return 0;       
457                         
458         }
459         catch(exception& e) {
460                 m->errorOut(e, "TrimSeqsCommand", "execute");
461                 exit(1);
462         }
463 }
464                 
465 /**************************************************************************************/
466
467 int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string trimFile, string scrapFile, string trimQFile, string scrapQFile, string groupFile, vector<string> fastaNames, vector<string> qualNames, linePair* line, linePair* qline) {      
468                 
469         try {
470                 
471                 ofstream outFASTA;
472                 int able = m->openOutputFile(trimFile, outFASTA);
473                 
474                 ofstream scrapFASTA;
475                 m->openOutputFile(scrapFile, scrapFASTA);
476                 
477                 ofstream outQual;
478                 ofstream scrapQual;
479                 if(qFileName != ""){
480                         m->openOutputFile(trimQFile, outQual);
481                         m->openOutputFile(scrapQFile, scrapQual);
482                 }
483                 
484                 ofstream outGroups;
485                 //vector<ofstream*> fastaFileNames;
486                 //vector<ofstream*> qualFileNames;
487                 
488                 if (oligoFile != "") {          
489                         m->openOutputFile(groupFile, outGroups);   
490                         for (int i = 0; i < fastaNames.size(); i++) {
491
492                         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
493                                 fastaNames[i] = (fastaNames[i] + toString(getpid()) + ".temp");
494                                 //fastaFileNames.push_back(new ofstream((fastaNames[i] + toString(getpid()) + ".temp").c_str(), ios::ate)); 
495                                 //clear old file if it exists
496                                 ofstream temp;
497                                 m->openOutputFile(fastaNames[i], temp);
498                                 temp.close();
499                                 if(qFileName != ""){
500                                         qualNames[i] = (qualNames[i] + toString(getpid()) + ".temp");
501                                         //qualFileNames.push_back(new ofstream((qualNames[i] + toString(getpid()) + ".temp").c_str(), ios::ate)); 
502                                         //clear old file if it exists
503                                         ofstream temp2;
504                                         m->openOutputFile(qualNames[i], temp2);
505                                         temp2.close();
506                                 }
507                         #else
508                                 //fastaFileNames.push_back(new ofstream((fastaNames[i] + toString(i) + ".temp").c_str(), ios::ate)); 
509                                 fastaNames[i] = (fastaNames[i] + toString(i) + ".temp");
510                                 ofstream temp;
511                                 m->openOutputFile(fastaNames[i], temp);
512                                 temp.close();                   
513                                 if(qFileName != ""){
514                                         //qualFileNames.push_back(new ofstream((qualNames[i] + toString(i) + ".temp").c_str(), ios::ate));      
515                                         qualNames[i] = (qualNames[i] + toString(i) + ".temp");
516                                         ofstream temp2;
517                                         m->openOutputFile(qualNames[i], temp2);
518                                         temp2.close();          
519                                 }
520                         #endif
521                         }
522                 }
523                 
524                 ifstream inFASTA;
525                 m->openInputFile(filename, inFASTA);
526                 inFASTA.seekg(line->start);
527                 
528                 ifstream qFile;
529                 if(qFileName != "")     {       m->openInputFile(qFileName, qFile);     qFile.seekg(qline->start);  }
530                 
531                 bool done = false;
532                 int count = 0;
533         
534                 while (!done) {
535                                 
536                         if (m->control_pressed) { 
537                                 inFASTA.close(); outFASTA.close(); scrapFASTA.close();
538                                 if (oligoFile != "") {   outGroups.close();   }
539                                 
540                                 //for(int i=0;i<fastaFileNames.size();i++){  fastaFileNames[i]->close(); delete fastaFileNames[i];  }   
541
542                                 if(qFileName != ""){
543                                         qFile.close();
544                                         //for(int i=0;i<qualFileNames.size();i++){  qualFileNames[i]->close(); delete qualFileNames[i];  }      
545                                 }
546                                 for (int i = 0; i < outputNames.size(); i++) {  remove(outputNames[i].c_str()); }
547
548                                 return 0;
549                         }
550                         
551                         int success = 1;
552                         
553
554                         Sequence currSeq(inFASTA); m->gobble(inFASTA);
555
556                         QualityScores currQual;
557                         if(qFileName != ""){
558                                 currQual = QualityScores(qFile, currSeq.getNumBases());  m->gobble(qFile);
559                         }
560                         
561                         string origSeq = currSeq.getUnaligned();
562                         if (origSeq != "") {
563                                 int groupBar, groupPrime;
564                                 string trashCode = "";
565                                 int currentSeqsDiffs = 0;
566
567                                 if(qFileName != ""){
568                                         if(qThreshold != 0)                     {       success = currQual.stripQualThreshold(currSeq, qThreshold);                     }
569                                         else if(qAverage != 0)          {       success = currQual.cullQualAverage(currSeq, qAverage);                          }
570                                         else if(qRollAverage != 0)      {       success = currQual.stripQualRollingAverage(currSeq, qRollAverage);      }
571                                         else if(qWindowAverage != 0){   success = currQual.stripQualWindowAverage(currSeq, qWindowStep, qWindowSize, qWindowAverage);   }
572
573                                         if (qtrim == 1 && (origSeq.length() != currSeq.getUnaligned().length())) { 
574                                                 success = 0; //if you don't want to trim and the sequence does not meet quality requirements, move to scrap
575                                         }
576
577                                         if(!success)                            {       trashCode += 'q';       }
578                                 }
579                         
580                                 if(barcodes.size() != 0){
581                                         success = stripBarcode(currSeq, currQual, groupBar);
582                                         if(success > bdiffs)            {       trashCode += 'b';       }
583                                         else{ currentSeqsDiffs += success;  }
584                                 }
585
586                                 if(numFPrimers != 0){
587                                         success = stripForward(currSeq, currQual, groupPrime);
588                                         if(success > pdiffs)            {       trashCode += 'f';       }
589                                         else{ currentSeqsDiffs += success;  }
590                                 }
591                                 
592                                 if (currentSeqsDiffs > tdiffs)  {       trashCode += 't';   }
593
594                                 if(numRPrimers != 0){
595                                         success = stripReverse(currSeq, currQual);
596                                         if(!success)                            {       trashCode += 'r';       }
597                                 }
598                 
599                                 if(minLength > 0 || maxLength > 0){
600                                         success = cullLength(currSeq);
601                                         if(!success)                            {       trashCode += 'l';       }
602                                 }
603                                 if(maxHomoP > 0){
604                                         success = cullHomoP(currSeq);
605                                         if(!success)                            {       trashCode += 'h';       }
606                                 }
607                                 if(maxAmbig != -1){
608                                         success = cullAmbigs(currSeq);
609                                         if(!success)                            {       trashCode += 'n';       }
610                                 }
611                                 
612                                 if(flip){       currSeq.reverseComplement();            currQual.flipQScores(); }               // should go last                       
613                                 
614                                 if(trashCode.length() == 0){
615                                         currSeq.setAligned(currSeq.getUnaligned());
616                                         currSeq.printSequence(outFASTA);
617                                         currQual.printQScores(outQual);
618                                         
619                                         if(barcodes.size() != 0){
620                                                 string thisGroup = groupVector[groupBar];
621                                                 int indexToFastaFile = groupBar;
622                                                 if (primers.size() != 0){
623                                                         //does this primer have a group
624                                                         if (groupVector[groupPrime] != "") {  
625                                                                 thisGroup += "." + groupVector[groupPrime]; 
626                                                                 indexToFastaFile = combos[thisGroup];
627                                                         }
628                                                 }
629                                                 outGroups << currSeq.getName() << '\t' << thisGroup << endl;
630                                                 if(allFiles){
631                                                         ofstream outTemp;
632                                                         m->openOutputFileAppend(fastaNames[indexToFastaFile], outTemp);
633                                                         //currSeq.printSequence(*fastaFileNames[indexToFastaFile]);
634                                                         currSeq.printSequence(outTemp);
635                                                         outTemp.close();
636                                                         
637                                                         if(qFileName != ""){
638                                                                 //currQual.printQScores(*qualFileNames[indexToFastaFile]);
639                                                                 ofstream outTemp2;
640                                                                 m->openOutputFileAppend(qualNames[indexToFastaFile], outTemp2);
641                                                                 currQual.printQScores(outTemp2);
642                                                                 outTemp2.close();                                                       
643                                                         }
644                                                 }
645                                         }
646                                 }
647                                 else{
648                                         currSeq.setName(currSeq.getName() + '|' + trashCode);
649                                         currSeq.setUnaligned(origSeq);
650                                         currSeq.setAligned(origSeq);
651                                         currSeq.printSequence(scrapFASTA);
652                                         currQual.printQScores(scrapQual);
653                                 }
654                                 count++;
655                         }
656                         
657                         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
658                                 unsigned long int pos = inFASTA.tellg();
659                                 if ((pos == -1) || (pos >= line->end)) { break; }
660                         #else
661                                 if (inFASTA.eof()) { break; }
662                         #endif
663                                 
664                         //report progress
665                         if((count) % 1000 == 0){        m->mothurOut(toString(count)); m->mothurOutEndLine();           }
666                         
667                 }
668                 //report progress
669                 if((count) % 1000 != 0){        m->mothurOut(toString(count)); m->mothurOutEndLine();           }
670
671                 
672                 inFASTA.close();
673                 outFASTA.close();
674                 scrapFASTA.close();
675                 if (oligoFile != "") {   outGroups.close();   }
676                 if(qFileName != "")     {       qFile.close();  scrapQual.close(); outQual.close();     }
677                 
678                 //for(int i=0;i<fastaFileNames.size();i++){
679                 //      fastaFileNames[i]->close();
680                 //      delete fastaFileNames[i];
681                 //}             
682                 
683                 //if(qFileName != ""){
684                         //for(int i=0;i<qualFileNames.size();i++){
685                                 //qualFileNames[i]->close();
686                                 //delete qualFileNames[i];
687                         //}             
688                 //}                     
689                 
690                 return count;
691         }
692         catch(exception& e) {
693                 m->errorOut(e, "TrimSeqsCommand", "driverCreateTrim");
694                 exit(1);
695         }
696 }
697
698 /**************************************************************************************************/
699
700 int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName, string trimFile, string scrapFile, string trimQFile, string scrapQFile, string groupFile, vector<string> fastaNames, vector<string> qualNames) {
701         try {
702 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
703                 int process = 0;
704                 int exitCommand = 1;
705                 processIDS.clear();
706                 
707                 //loop through and create all the processes you want
708                 while (process != processors) {
709                         int pid = fork();
710                         
711                         if (pid > 0) {
712                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
713                                 process++;
714                         }else if (pid == 0){
715                                 driverCreateTrim(filename, qFileName, (trimFile + toString(getpid()) + ".temp"), (scrapFile + toString(getpid()) + ".temp"), (trimQFile + toString(getpid()) + ".temp"), (scrapQFile + toString(getpid()) + ".temp"), (groupFile + toString(getpid()) + ".temp"), fastaNames, qualNames, lines[process], qLines[process]);
716                                 exit(0);
717                         }else { 
718                                 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine(); 
719                                 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
720                                 exit(0);
721                         }
722                 }
723                 
724                 //force parent to wait until all the processes are done
725                 for (int i=0;i<processors;i++) { 
726                         int temp = processIDS[i];
727                         wait(&temp);
728                 }
729                 
730                 return exitCommand;
731 #endif          
732         }
733         catch(exception& e) {
734                 m->errorOut(e, "TrimSeqsCommand", "createProcessesCreateTrim");
735                 exit(1);
736         }
737 }
738
739 /**************************************************************************************************/
740
741 int TrimSeqsCommand::setLines(string filename, string qfilename, vector<unsigned long int>& fastaFilePos, vector<unsigned long int>& qfileFilePos) {
742         try {
743                 
744                 //set file positions for fasta file
745                 fastaFilePos = m->divideFile(filename, processors);
746                 
747                 if (qfilename == "") { return processors; }
748                 
749                 //get name of first sequence in each chunk
750                 map<string, int> firstSeqNames;
751                 for (int i = 0; i < (fastaFilePos.size()-1); i++) {
752                         ifstream in;
753                         m->openInputFile(filename, in);
754                         in.seekg(fastaFilePos[i]);
755                 
756                         Sequence temp(in); 
757                         firstSeqNames[temp.getName()] = i;
758                 
759                         in.close();
760                 }
761                                 
762                 //seach for filePos of each first name in the qfile and save in qfileFilePos
763                 ifstream inQual;
764                 m->openInputFile(qfilename, inQual);
765                 
766                 string input;
767                 while(!inQual.eof()){   
768                         input = m->getline(inQual);
769
770                         if (input.length() != 0) {
771                                 if(input[0] == '>'){ //this is a sequence name line
772                                         istringstream nameStream(input);
773                                         
774                                         string sname = "";  nameStream >> sname;
775                                         sname = sname.substr(1);
776                                         
777                                         map<string, int>::iterator it = firstSeqNames.find(sname);
778                                         
779                                         if(it != firstSeqNames.end()) { //this is the start of a new chunk
780                                                 unsigned long int pos = inQual.tellg(); 
781                                                 qfileFilePos.push_back(pos - input.length() - 1);       
782                                                 firstSeqNames.erase(it);
783                                         }
784                                 }
785                         }
786                         
787                         if (firstSeqNames.size() == 0) { break; }
788                 }
789                 inQual.close();
790                 
791                 
792                 if (firstSeqNames.size() != 0) { 
793                         for (map<string, int>::iterator it = firstSeqNames.begin(); it != firstSeqNames.end(); it++) {
794                                 m->mothurOut(it->first + " is in your fasta file and not in your quality file, not using quality file."); m->mothurOutEndLine();
795                         }
796                         qFileName = "";
797                         return processors;
798                 }
799
800                 //get last file position of qfile
801                 FILE * pFile;
802                 unsigned long int size;
803                 
804                 //get num bytes in file
805                 pFile = fopen (qfilename.c_str(),"rb");
806                 if (pFile==NULL) perror ("Error opening file");
807                 else{
808                         fseek (pFile, 0, SEEK_END);
809                         size=ftell (pFile);
810                         fclose (pFile);
811                 }
812                 
813                 qfileFilePos.push_back(size);
814                 
815                 return processors;
816         }
817         catch(exception& e) {
818                 m->errorOut(e, "TrimSeqsCommand", "setLines");
819                 exit(1);
820         }
821 }
822 //***************************************************************************************************************
823
824 void TrimSeqsCommand::getOligos(vector<string>& outFASTAVec, vector<string>& outQualVec){
825         try {
826                 ifstream inOligos;
827                 m->openInputFile(oligoFile, inOligos);
828                 
829                 ofstream test;
830                 
831                 string type, oligo, group;
832                 int index=0;
833                 //int indexPrimer = 0;
834                 
835                 while(!inOligos.eof()){
836                         inOligos >> type;
837                                         
838                         if(type[0] == '#'){
839                                 while (!inOligos.eof()) {       char c = inOligos.get(); if (c == 10 || c == 13){       break;  }       } // get rest of line if there's any crap there
840                         }
841                         else{
842                                 //make type case insensitive
843                                 for(int i=0;i<type.length();i++){       type[i] = toupper(type[i]);  }
844                                 
845                                 inOligos >> oligo;
846                                 
847                                 for(int i=0;i<oligo.length();i++){
848                                         oligo[i] = toupper(oligo[i]);
849                                         if(oligo[i] == 'U')     {       oligo[i] = 'T'; }
850                                 }
851                                 
852                                 if(type == "FORWARD"){
853                                         group = "";
854                                         
855                                         // get rest of line in case there is a primer name
856                                         while (!inOligos.eof()) {       
857                                                 char c = inOligos.get(); 
858                                                 if (c == 10 || c == 13){        break;  }
859                                                 else if (c == 32 || c == 9){;} //space or tab
860                                                 else {  group += c;  }
861                                         } 
862                                         
863                                         //check for repeat barcodes
864                                         map<string, int>::iterator itPrime = primers.find(oligo);
865                                         if (itPrime != primers.end()) { m->mothurOut("primer " + oligo + " is in your oligos file already."); m->mothurOutEndLine();  }
866                                         
867                                         primers[oligo]=index; index++;
868                                         groupVector.push_back(group);
869                                         
870                                         if(allFiles){
871                                                 outFASTAVec.push_back((outputDir + m->getRootName(m->getSimpleName(fastaFile)) + group + ".fasta"));
872                                                 if(qFileName != ""){
873                                                         outQualVec.push_back((outputDir + m->getRootName(m->getSimpleName(qFileName)) + group + ".qual"));
874                                                 }
875                                                 if (group == "") { //if there is not a group for this primer, then this file will not get written to, but we add it to keep the indexes correct
876                                                         filesToRemove.insert((outputDir + m->getRootName(m->getSimpleName(fastaFile)) + group + ".fasta"));
877                                                         if(qFileName != ""){
878                                                                 filesToRemove.insert((outputDir + m->getRootName(m->getSimpleName(qFileName)) + group + ".qual"));
879                                                         }
880                                                 }else {
881                                                         outputNames.push_back((outputDir + m->getRootName(m->getSimpleName(fastaFile)) + group + ".fasta"));
882                                                         outputTypes["fasta"].push_back((outputDir + m->getRootName(m->getSimpleName(fastaFile)) + group + ".fasta"));
883                                                         if(qFileName != ""){
884                                                                 outputNames.push_back((outputDir + m->getRootName(m->getSimpleName(qFileName)) + group + ".qual"));
885                                                                 outputTypes["qual"].push_back((outputDir + m->getRootName(m->getSimpleName(qFileName)) + group + ".qual"));
886                                                         }                                                       
887                                                 }
888                                         }
889
890                                 }
891                                 else if(type == "REVERSE"){
892                                         Sequence oligoRC("reverse", oligo);
893                                         oligoRC.reverseComplement();
894                                         revPrimer.push_back(oligoRC.getUnaligned());
895                                 }
896                                 else if(type == "BARCODE"){
897                                         inOligos >> group;
898                                         
899                                         //check for repeat barcodes
900                                         map<string, int>::iterator itBar = barcodes.find(oligo);
901                                         if (itBar != barcodes.end()) { m->mothurOut("barcode " + oligo + " is in your oligos file already."); m->mothurOutEndLine();  }
902                                         
903                                         barcodes[oligo]=index; index++;
904                                         groupVector.push_back(group);
905                                         
906                                         if(allFiles){
907                                                 outputNames.push_back((outputDir + m->getRootName(m->getSimpleName(fastaFile)) + group + ".fasta"));
908                                                 outputNames.push_back((outputDir + m->getRootName(m->getSimpleName(fastaFile)) + group + ".fasta"));
909                                                 outFASTAVec.push_back((outputDir + m->getRootName(m->getSimpleName(fastaFile)) + group + ".fasta"));
910                                                 if(qFileName != ""){
911                                                         outQualVec.push_back((outputDir + m->getRootName(m->getSimpleName(qFileName)) + group + ".qual"));
912                                                         outputNames.push_back((outputDir + m->getRootName(m->getSimpleName(qFileName)) + group + ".qual"));
913                                                         outputTypes["qual"].push_back((outputDir + m->getRootName(m->getSimpleName(qFileName)) + group + ".qual"));
914                                                 }                                                       
915                                         }
916                                 }else{  m->mothurOut(type + " is not recognized as a valid type. Choices are forward, reverse, and barcode. Ignoring " + oligo + "."); m->mothurOutEndLine();  }
917                         }
918                         m->gobble(inOligos);
919                 }
920                 
921                 inOligos.close();
922                 
923                 //add in potential combos
924                 if(allFiles){
925                         comboStarts = outFASTAVec.size()-1;
926                         for (map<string, int>::iterator itBar = barcodes.begin(); itBar != barcodes.end(); itBar++) {
927                                 for (map<string, int>::iterator itPrime = primers.begin(); itPrime != primers.end(); itPrime++) {
928                                         if (groupVector[itPrime->second] != "") { //there is a group for this primer
929                                                 outputNames.push_back((outputDir + m->getRootName(m->getSimpleName(qFileName)) + groupVector[itBar->second] + "." + groupVector[itPrime->second] + ".fasta"));
930                                                 outputTypes["fasta"].push_back((outputDir + m->getRootName(m->getSimpleName(qFileName)) + groupVector[itBar->second] + "." + groupVector[itPrime->second] + ".fasta"));
931                                                 outFASTAVec.push_back((outputDir + m->getRootName(m->getSimpleName(fastaFile)) + groupVector[itBar->second] + "." + groupVector[itPrime->second] + ".fasta"));
932                                                 combos[(groupVector[itBar->second] + "." + groupVector[itPrime->second])] = outFASTAVec.size()-1;
933                                                 
934                                                 if(qFileName != ""){
935                                                         outQualVec.push_back((outputDir + m->getRootName(m->getSimpleName(qFileName)) + groupVector[itBar->second] + "." + groupVector[itPrime->second] + ".qual"));
936                                                         outputNames.push_back((outputDir + m->getRootName(m->getSimpleName(qFileName)) + groupVector[itBar->second] + "." + groupVector[itPrime->second] + ".qual"));
937                                                         outputTypes["qual"].push_back((outputDir + m->getRootName(m->getSimpleName(qFileName)) + groupVector[itBar->second] + "." + groupVector[itPrime->second] + ".qual"));
938                                                 }
939                                         }
940                                 }
941                         }
942                 }
943                 
944                 numFPrimers = primers.size();
945                 numRPrimers = revPrimer.size();
946                 
947         }
948         catch(exception& e) {
949                 m->errorOut(e, "TrimSeqsCommand", "getOligos");
950                 exit(1);
951         }
952 }
953 //***************************************************************************************************************
954
955 int TrimSeqsCommand::stripBarcode(Sequence& seq, QualityScores& qual, int& group){
956         try {
957                 
958                 string rawSequence = seq.getUnaligned();
959                 int success = bdiffs + 1;       //guilty until proven innocent
960                 
961                 //can you find the barcode
962                 for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
963                         string oligo = it->first;
964                         if(rawSequence.length() < oligo.length()){      //let's just assume that the barcodes are the same length
965                                 success = bdiffs + 10;                                  //if the sequence is shorter than the barcode then bail out
966                                 break;  
967                         }
968                         
969                         if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
970                                 group = it->second;
971                                 seq.setUnaligned(rawSequence.substr(oligo.length()));
972                                 
973                                 if(qual.getName() != ""){
974                                         qual.trimQScores(oligo.length(), -1);
975                                 }
976                                 
977                                 success = 0;
978                                 break;
979                         }
980                 }
981                 
982                 //if you found the barcode or if you don't want to allow for diffs
983 //              cout << success;
984                 if ((bdiffs == 0) || (success == 0)) { return success;  }
985                 
986                 else { //try aligning and see if you can find it
987 //                      cout << endl;
988
989                         int maxLength = 0;
990
991                         Alignment* alignment;
992                         if (barcodes.size() > 0) {
993                                 map<string,int>::iterator it=barcodes.begin();
994
995                                 for(it;it!=barcodes.end();it++){
996                                         if(it->first.length() > maxLength){
997                                                 maxLength = it->first.length();
998                                         }
999                                 }
1000                                 alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+bdiffs+1));  
1001
1002                         }else{ alignment = NULL; } 
1003                         
1004                         //can you find the barcode
1005                         int minDiff = 1e6;
1006                         int minCount = 1;
1007                         int minGroup = -1;
1008                         int minPos = 0;
1009                         
1010                         for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
1011                                 string oligo = it->first;
1012 //                              int length = oligo.length();
1013                                 
1014                                 if(rawSequence.length() < maxLength){   //let's just assume that the barcodes are the same length
1015                                         success = bdiffs + 10;
1016                                         break;
1017                                 }
1018                                 
1019                                 //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
1020                                 alignment->align(oligo, rawSequence.substr(0,oligo.length()+bdiffs));
1021                                 oligo = alignment->getSeqAAln();
1022                                 string temp = alignment->getSeqBAln();
1023                 
1024                                 int alnLength = oligo.length();
1025                                 
1026                                 for(int i=oligo.length()-1;i>=0;i--){
1027                                         if(oligo[i] != '-'){    alnLength = i+1;        break;  }
1028                                 }
1029                                 oligo = oligo.substr(0,alnLength);
1030                                 temp = temp.substr(0,alnLength);
1031                                 
1032                                 int newStart=0;
1033                                 int numDiff = countDiffs(oligo, temp);
1034                                 
1035 //                              cout << oligo << '\t' << temp << '\t' << numDiff << endl;                               
1036                                 
1037                                 if(numDiff < minDiff){
1038                                         minDiff = numDiff;
1039                                         minCount = 1;
1040                                         minGroup = it->second;
1041                                         minPos = 0;
1042                                         for(int i=0;i<alnLength;i++){
1043                                                 if(temp[i] != '-'){
1044                                                         minPos++;
1045                                                 }
1046                                         }
1047                                 }
1048                                 else if(numDiff == minDiff){
1049                                         minCount++;
1050                                 }
1051
1052                         }
1053
1054                         if(minDiff > bdiffs)    {       success = minDiff;              }       //no good matches
1055                         else if(minCount > 1)   {       success = bdiffs + 100; }       //can't tell the difference between multiple barcodes
1056                         else{                                                                                                   //use the best match
1057                                 group = minGroup;
1058                                 seq.setUnaligned(rawSequence.substr(minPos));
1059                                 
1060                                 if(qual.getName() != ""){
1061                                         qual.trimQScores(minPos, -1);
1062                                 }
1063                                 success = minDiff;
1064                         }
1065                         
1066                         if (alignment != NULL) {  delete alignment;  }
1067                         
1068                 }
1069 //              cout << success << endl;
1070                 
1071                 return success;
1072                 
1073         }
1074         catch(exception& e) {
1075                 m->errorOut(e, "TrimSeqsCommand", "stripBarcode");
1076                 exit(1);
1077         }
1078
1079 }
1080
1081 //***************************************************************************************************************
1082
1083 int TrimSeqsCommand::stripForward(Sequence& seq, QualityScores& qual, int& group){
1084         try {
1085                 string rawSequence = seq.getUnaligned();
1086                 int success = pdiffs + 1;       //guilty until proven innocent
1087                 
1088                 //can you find the primer
1089                 for(map<string,int>::iterator it=primers.begin();it!=primers.end();it++){
1090                         string oligo = it->first;
1091                         if(rawSequence.length() < oligo.length()){      //let's just assume that the primers are the same length
1092                                 success = pdiffs + 10;                                  //if the sequence is shorter than the barcode then bail out
1093                                 break;  
1094                         }
1095                         
1096                         if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
1097                                 group = it->second;
1098                                 seq.setUnaligned(rawSequence.substr(oligo.length()));
1099                                 if(qual.getName() != ""){
1100                                         qual.trimQScores(oligo.length(), -1);
1101                                         
1102                                 }
1103                                 success = 0;
1104                                 break;
1105                         }
1106                 }
1107
1108                 //if you found the barcode or if you don't want to allow for diffs
1109 //              cout << success;
1110                 if ((pdiffs == 0) || (success == 0)) { return success;  }
1111                 
1112                 else { //try aligning and see if you can find it
1113 //                      cout << endl;
1114
1115                         int maxLength = 0;
1116
1117                         Alignment* alignment;
1118                         if (primers.size() > 0) {
1119                                 map<string,int>::iterator it=primers.begin();
1120
1121                                 for(it;it!=primers.end();it++){
1122                                         if(it->first.length() > maxLength){
1123                                                 maxLength = it->first.length();
1124                                         }
1125                                 }
1126                                 alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+pdiffs+1));  
1127
1128                         }else{ alignment = NULL; } 
1129                         
1130                         //can you find the barcode
1131                         int minDiff = 1e6;
1132                         int minCount = 1;
1133                         int minGroup = -1;
1134                         int minPos = 0;
1135                         
1136                         for(map<string,int>::iterator it=primers.begin();it!=primers.end();it++){
1137                                 string oligo = it->first;
1138 //                              int length = oligo.length();
1139                                 
1140                                 if(rawSequence.length() < maxLength){   
1141                                         success = pdiffs + 100;
1142                                         break;
1143                                 }
1144                                 
1145                                 //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
1146                                 alignment->align(oligo, rawSequence.substr(0,oligo.length()+pdiffs));
1147                                 oligo = alignment->getSeqAAln();
1148                                 string temp = alignment->getSeqBAln();
1149                 
1150                                 int alnLength = oligo.length();
1151                                 
1152                                 for(int i=oligo.length()-1;i>=0;i--){
1153                                         if(oligo[i] != '-'){    alnLength = i+1;        break;  }
1154                                 }
1155                                 oligo = oligo.substr(0,alnLength);
1156                                 temp = temp.substr(0,alnLength);
1157                                 
1158                                 int newStart=0;
1159                                 int numDiff = countDiffs(oligo, temp);
1160                                 
1161 //                              cout << oligo << '\t' << temp << '\t' << numDiff << endl;                               
1162                                 
1163                                 if(numDiff < minDiff){
1164                                         minDiff = numDiff;
1165                                         minCount = 1;
1166                                         minGroup = it->second;
1167                                         minPos = 0;
1168                                         for(int i=0;i<alnLength;i++){
1169                                                 if(temp[i] != '-'){
1170                                                         minPos++;
1171                                                 }
1172                                         }
1173                                 }
1174                                 else if(numDiff == minDiff){
1175                                         minCount++;
1176                                 }
1177
1178                         }
1179
1180                         if(minDiff > pdiffs)    {       success = minDiff;              }       //no good matches
1181                         else if(minCount > 1)   {       success = pdiffs + 10;  }       //can't tell the difference between multiple primers
1182                         else{                                                                                                   //use the best match
1183                                 group = minGroup;
1184                                 seq.setUnaligned(rawSequence.substr(minPos));
1185                                 if(qual.getName() != ""){
1186                                         qual.trimQScores(minPos, -1);
1187                                 }
1188                                 success = minDiff;
1189                         }
1190                         
1191                         if (alignment != NULL) {  delete alignment;  }
1192                         
1193                 }
1194                 
1195                 return success;
1196
1197         }
1198         catch(exception& e) {
1199                 m->errorOut(e, "TrimSeqsCommand", "stripForward");
1200                 exit(1);
1201         }
1202 }
1203
1204 //***************************************************************************************************************
1205
1206 bool TrimSeqsCommand::stripReverse(Sequence& seq, QualityScores& qual){
1207         try {
1208                 string rawSequence = seq.getUnaligned();
1209                 bool success = 0;       //guilty until proven innocent
1210                 
1211                 for(int i=0;i<numRPrimers;i++){
1212                         string oligo = revPrimer[i];
1213                         
1214                         if(rawSequence.length() < oligo.length()){
1215                                 success = 0;
1216                                 break;
1217                         }
1218                         
1219                         if(compareDNASeq(oligo, rawSequence.substr(rawSequence.length()-oligo.length(),oligo.length()))){
1220                                 seq.setUnaligned(rawSequence.substr(0,rawSequence.length()-oligo.length()));
1221                                 if(qual.getName() != ""){
1222                                         qual.trimQScores(-1, rawSequence.length()-oligo.length());
1223                                 }
1224                                 success = 1;
1225                                 break;
1226                         }
1227                 }       
1228                 return success;
1229                 
1230         }
1231         catch(exception& e) {
1232                 m->errorOut(e, "TrimSeqsCommand", "stripReverse");
1233                 exit(1);
1234         }
1235 }
1236
1237 //***************************************************************************************************************
1238
1239 bool TrimSeqsCommand::cullLength(Sequence& seq){
1240         try {
1241         
1242                 int length = seq.getNumBases();
1243                 bool success = 0;       //guilty until proven innocent
1244                 
1245                 if(length >= minLength && maxLength == 0)                       {       success = 1;    }
1246                 else if(length >= minLength && length <= maxLength)     {       success = 1;    }
1247                 else                                                                                            {       success = 0;    }
1248                 
1249                 return success;
1250         
1251         }
1252         catch(exception& e) {
1253                 m->errorOut(e, "TrimSeqsCommand", "cullLength");
1254                 exit(1);
1255         }
1256         
1257 }
1258
1259 //***************************************************************************************************************
1260
1261 bool TrimSeqsCommand::cullHomoP(Sequence& seq){
1262         try {
1263                 int longHomoP = seq.getLongHomoPolymer();
1264                 bool success = 0;       //guilty until proven innocent
1265                 
1266                 if(longHomoP <= maxHomoP){      success = 1;    }
1267                 else                                    {       success = 0;    }
1268                 
1269                 return success;
1270         }
1271         catch(exception& e) {
1272                 m->errorOut(e, "TrimSeqsCommand", "cullHomoP");
1273                 exit(1);
1274         }
1275         
1276 }
1277
1278 //***************************************************************************************************************
1279
1280 bool TrimSeqsCommand::cullAmbigs(Sequence& seq){
1281         try {
1282                 int numNs = seq.getAmbigBases();
1283                 bool success = 0;       //guilty until proven innocent
1284                 
1285                 if(numNs <= maxAmbig)   {       success = 1;    }
1286                 else                                    {       success = 0;    }
1287                 
1288                 return success;
1289         }
1290         catch(exception& e) {
1291                 m->errorOut(e, "TrimSeqsCommand", "cullAmbigs");
1292                 exit(1);
1293         }
1294         
1295 }
1296
1297 //***************************************************************************************************************
1298
1299 bool TrimSeqsCommand::compareDNASeq(string oligo, string seq){
1300         try {
1301                 bool success = 1;
1302                 int length = oligo.length();
1303                 
1304                 for(int i=0;i<length;i++){
1305                         
1306                         if(oligo[i] != seq[i]){
1307                                 if(oligo[i] == 'A' || oligo[i] == 'T' || oligo[i] == 'G' || oligo[i] == 'C')    {       success = 0;    }
1308                                 else if((oligo[i] == 'N' || oligo[i] == 'I') && (seq[i] == 'N'))                                {       success = 0;    }
1309                                 else if(oligo[i] == 'R' && (seq[i] != 'A' && seq[i] != 'G'))                                    {       success = 0;    }
1310                                 else if(oligo[i] == 'Y' && (seq[i] != 'C' && seq[i] != 'T'))                                    {       success = 0;    }
1311                                 else if(oligo[i] == 'M' && (seq[i] != 'C' && seq[i] != 'A'))                                    {       success = 0;    }
1312                                 else if(oligo[i] == 'K' && (seq[i] != 'T' && seq[i] != 'G'))                                    {       success = 0;    }
1313                                 else if(oligo[i] == 'W' && (seq[i] != 'T' && seq[i] != 'A'))                                    {       success = 0;    }
1314                                 else if(oligo[i] == 'S' && (seq[i] != 'C' && seq[i] != 'G'))                                    {       success = 0;    }
1315                                 else if(oligo[i] == 'B' && (seq[i] != 'C' && seq[i] != 'T' && seq[i] != 'G'))   {       success = 0;    }
1316                                 else if(oligo[i] == 'D' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'G'))   {       success = 0;    }
1317                                 else if(oligo[i] == 'H' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'C'))   {       success = 0;    }
1318                                 else if(oligo[i] == 'V' && (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G'))   {       success = 0;    }                       
1319                                 
1320                                 if(success == 0)        {       break;   }
1321                         }
1322                         else{
1323                                 success = 1;
1324                         }
1325                 }
1326                 
1327                 return success;
1328         }
1329         catch(exception& e) {
1330                 m->errorOut(e, "TrimSeqsCommand", "compareDNASeq");
1331                 exit(1);
1332         }
1333
1334 }
1335 //***************************************************************************************************************
1336
1337 int TrimSeqsCommand::countDiffs(string oligo, string seq){
1338         try {
1339
1340                 int length = oligo.length();
1341                 int countDiffs = 0;
1342                 
1343                 for(int i=0;i<length;i++){
1344                                                                 
1345                         if(oligo[i] != seq[i]){
1346                                 if(oligo[i] == 'A' || oligo[i] == 'T' || oligo[i] == 'G' || oligo[i] == 'C' || oligo[i] == '-' || oligo[i] == '.')      {       countDiffs++;   }
1347                                 else if((oligo[i] == 'N' || oligo[i] == 'I') && (seq[i] == 'N'))                                {       countDiffs++;   }
1348                                 else if(oligo[i] == 'R' && (seq[i] != 'A' && seq[i] != 'G'))                                    {       countDiffs++;   }
1349                                 else if(oligo[i] == 'Y' && (seq[i] != 'C' && seq[i] != 'T'))                                    {       countDiffs++;   }
1350                                 else if(oligo[i] == 'M' && (seq[i] != 'C' && seq[i] != 'A'))                                    {       countDiffs++;   }
1351                                 else if(oligo[i] == 'K' && (seq[i] != 'T' && seq[i] != 'G'))                                    {       countDiffs++;   }
1352                                 else if(oligo[i] == 'W' && (seq[i] != 'T' && seq[i] != 'A'))                                    {       countDiffs++;   }
1353                                 else if(oligo[i] == 'S' && (seq[i] != 'C' && seq[i] != 'G'))                                    {       countDiffs++;   }
1354                                 else if(oligo[i] == 'B' && (seq[i] != 'C' && seq[i] != 'T' && seq[i] != 'G'))   {       countDiffs++;   }
1355                                 else if(oligo[i] == 'D' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'G'))   {       countDiffs++;   }
1356                                 else if(oligo[i] == 'H' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'C'))   {       countDiffs++;   }
1357                                 else if(oligo[i] == 'V' && (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G'))   {       countDiffs++;   }       
1358                         }
1359                         
1360                 }
1361                 
1362                 return countDiffs;
1363         }
1364         catch(exception& e) {
1365                 m->errorOut(e, "TrimSeqsCommand", "countDiffs");
1366                 exit(1);
1367         }
1368
1369 }
1370 //***************************************************************************************************************
1371
1372 //bool TrimSeqsCommand::stripQualThreshold(Sequence& seq, ifstream& qFile){
1373 //      try {
1374 //              
1375 //              string rawSequence = seq.getUnaligned();
1376 //              int seqLength = seq.getNumBases();
1377 //              bool success = 0;       //guilty until proven innocent
1378 //              string name;
1379 //              
1380 //              qFile >> name;
1381 //              if (name[0] == '>') {  if(name.substr(1) != seq.getName())      {       m->mothurOut("sequence name mismatch btwn fasta: " + seq.getName() + " and qual file: " + name); m->mothurOutEndLine(); }  }
1382 //              
1383 //              while (!qFile.eof())    {       char c = qFile.get(); if (c == 10 || c == 13){  break;  }       }
1384 //              
1385 //              int score;
1386 //              int end = seqLength;
1387 //              
1388 //              for(int i=0;i<seqLength;i++){
1389 //                      qFile >> score;
1390 //                      
1391 //                      if(score < qThreshold){
1392 //                              end = i;
1393 //                              break;
1394 //                      }
1395 //              }
1396 //              for(int i=end+1;i<seqLength;i++){
1397 //                      qFile >> score;
1398 //              }
1399 //              
1400 //              seq.setUnaligned(rawSequence.substr(0,end));
1401 //              
1402 //              return 1;
1403 //      }
1404 //      catch(exception& e) {
1405 //              m->errorOut(e, "TrimSeqsCommand", "stripQualThreshold");
1406 //              exit(1);
1407 //      }
1408 //}
1409
1410 //***************************************************************************************************************
1411
1412 //bool TrimSeqsCommand::cullQualAverage(Sequence& seq, ifstream& qFile){
1413 //      try {
1414 //              string rawSequence = seq.getUnaligned();
1415 //              int seqLength = seq.getNumBases();
1416 //              bool success = 0;       //guilty until proven innocent
1417 //              string name;
1418 //              
1419 //              qFile >> name;
1420 //              if (name[0] == '>') {  if(name.substr(1) != seq.getName())      {       m->mothurOut("sequence name mismatch btwn fasta: " + seq.getName() + " and qual file: " + name); m->mothurOutEndLine(); }  }
1421 //              
1422 //              while (!qFile.eof())    {       char c = qFile.get(); if (c == 10 || c == 13){  break;  }       }
1423 //              
1424 //              float score;    
1425 //              float average = 0;
1426 //              
1427 //              for(int i=0;i<seqLength;i++){
1428 //                      qFile >> score;
1429 //                      average += score;
1430 //              }
1431 //              average /= seqLength;
1432 //
1433 //              if(average >= qAverage) {       success = 1;    }
1434 //              else                                    {       success = 0;    }
1435 //              
1436 //              return success;
1437 //      }
1438 //      catch(exception& e) {
1439 //              m->errorOut(e, "TrimSeqsCommand", "cullQualAverage");
1440 //              exit(1);
1441 //      }
1442 //}
1443
1444 //***************************************************************************************************************