]> git.donarmstrong.com Git - mothur.git/blob - trimseqscommand.cpp
added citation function to commands
[mothur.git] / trimseqscommand.cpp
1 /*
2  *  trimseqscommand.cpp
3  *  Mothur
4  *
5  *  Created by Pat Schloss on 6/6/09.
6  *  Copyright 2009 Patrick D. Schloss. All rights reserved.
7  *
8  */
9
10 #include "trimseqscommand.h"
11 #include "needlemanoverlap.hpp"
12
13 //**********************************************************************************************************************
14 vector<string> TrimSeqsCommand::setParameters(){        
15         try {
16                 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta);
17                 CommandParameter poligos("oligos", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(poligos);
18                 CommandParameter pqfile("qfile", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pqfile);
19                 CommandParameter pflip("flip", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pflip);
20                 CommandParameter pmaxambig("maxambig", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pmaxambig);
21                 CommandParameter pmaxhomop("maxhomop", "Number", "", "0", "", "", "",false,false); parameters.push_back(pmaxhomop);
22                 CommandParameter pminlength("minlength", "Number", "", "0", "", "", "",false,false); parameters.push_back(pminlength);
23                 CommandParameter pmaxlength("maxlength", "Number", "", "0", "", "", "",false,false); parameters.push_back(pmaxlength);
24                 CommandParameter ppdiffs("pdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(ppdiffs);
25                 CommandParameter pbdiffs("bdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(pbdiffs);
26                 CommandParameter ptdiffs("tdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(ptdiffs);
27                 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
28                 CommandParameter pallfiles("allfiles", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pallfiles);
29                 CommandParameter pqtrim("qtrim", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pqtrim);
30                 CommandParameter pqthreshold("qthreshold", "Number", "", "0", "", "", "",false,false); parameters.push_back(pqthreshold);
31                 CommandParameter pqaverage("qaverage", "Number", "", "0", "", "", "",false,false); parameters.push_back(pqaverage);
32                 CommandParameter prollaverage("rollaverage", "Number", "", "0", "", "", "",false,false); parameters.push_back(prollaverage);
33                 CommandParameter pqwindowaverage("qwindowaverage", "Number", "", "0", "", "", "",false,false); parameters.push_back(pqwindowaverage);
34                 CommandParameter pqstepsize("qstepsize", "Number", "", "1", "", "", "",false,false); parameters.push_back(pqstepsize);
35                 CommandParameter pqwindowsize("qwindowsize", "Number", "", "50", "", "", "",false,false); parameters.push_back(pqwindowsize);
36                 CommandParameter pkeepfirst("keepfirst", "Number", "", "0", "", "", "",false,false); parameters.push_back(pkeepfirst);
37                 CommandParameter premovelast("removelast", "Number", "", "0", "", "", "",false,false); parameters.push_back(premovelast);
38                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
39                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
40                         
41                 vector<string> myArray;
42                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
43                 return myArray;
44         }
45         catch(exception& e) {
46                 m->errorOut(e, "TrimSeqsCommand", "setParameters");
47                 exit(1);
48         }
49 }
50 //**********************************************************************************************************************
51 string TrimSeqsCommand::getHelpString(){        
52         try {
53                 string helpString = "";
54                 helpString += "The trim.seqs command reads a fastaFile and creates 2 new fasta files, .trim.fasta and scrap.fasta, as well as group files if you provide and oligos file.\n";
55                 helpString += "The .trim.fasta contains sequences that meet your requirements, and the .scrap.fasta contains those which don't.\n";
56                 helpString += "The trim.seqs command parameters are fasta, flip, oligos, maxambig, maxhomop, minlength, maxlength, qfile, qthreshold, qaverage, diffs, qtrim, keepfirst, removelast and allfiles.\n";
57                 helpString += "The fasta parameter is required.\n";
58                 helpString += "The flip parameter will output the reverse compliment of your trimmed sequence. The default is false.\n";
59                 helpString += "The oligos parameter allows you to provide an oligos file.\n";
60                 helpString += "The maxambig parameter allows you to set the maximum number of ambigious bases allowed. The default is -1.\n";
61                 helpString += "The maxhomop parameter allows you to set a maximum homopolymer length. \n";
62                 helpString += "The minlength parameter allows you to set and minimum sequence length. \n";
63                 helpString += "The maxlength parameter allows you to set and maximum sequence length. \n";
64                 helpString += "The tdiffs parameter is used to specify the total number of differences allowed in the sequence. The default is pdiffs + bdiffs.\n";
65                 helpString += "The bdiffs parameter is used to specify the number of differences allowed in the barcode. The default is 0.\n";
66                 helpString += "The pdiffs parameter is used to specify the number of differences allowed in the primer. The default is 0.\n";
67                 helpString += "The qfile parameter allows you to provide a quality file.\n";
68                 helpString += "The qthreshold parameter allows you to set a minimum quality score allowed. \n";
69                 helpString += "The qaverage parameter allows you to set a minimum average quality score allowed. \n";
70                 helpString += "The qwindowsize parameter allows you to set a number of bases in a window. Default=50.\n";
71                 helpString += "The qwindowaverage parameter allows you to set a minimum average quality score allowed over a window. \n";
72                 helpString += "The rollaverage parameter allows you to set a minimum rolling average quality score allowed over a window. \n";
73                 helpString += "The qstepsize parameter allows you to set a number of bases to move the window over. Default=1.\n";
74                 helpString += "The allfiles parameter will create separate group and fasta file for each grouping. The default is F.\n";
75                 helpString += "The qtrim parameter will trim sequence from the point that they fall below the qthreshold and put it in the .trim file if set to true. The default is T.\n";
76                 helpString += "The keepfirst parameter trims the sequence to the first keepfirst number of bases after the barcode or primers are removed, before the sequence is checked to see if it meets the other requirements. \n";
77                 helpString += "The removelast removes the last removelast number of bases after the barcode or primers are removed, before the sequence is checked to see if it meets the other requirements.\n";
78                 helpString += "The trim.seqs command should be in the following format: \n";
79                 helpString += "trim.seqs(fasta=yourFastaFile, flip=yourFlip, oligos=yourOligos, maxambig=yourMaxambig,  \n";
80                 helpString += "maxhomop=yourMaxhomop, minlength=youMinlength, maxlength=yourMaxlength)  \n";    
81                 helpString += "Example trim.seqs(fasta=abrecovery.fasta, flip=..., oligos=..., maxambig=..., maxhomop=..., minlength=..., maxlength=...).\n";
82                 helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n";
83                 helpString += "For more details please check out the wiki http://www.mothur.org/wiki/Trim.seqs .\n";
84                 return helpString;
85         }
86         catch(exception& e) {
87                 m->errorOut(e, "TrimSeqsCommand", "getHelpString");
88                 exit(1);
89         }
90 }
91
92
93 //**********************************************************************************************************************
94
95 TrimSeqsCommand::TrimSeqsCommand(){     
96         try {
97                 abort = true; calledHelp = true; 
98                 setParameters();
99                 vector<string> tempOutNames;
100                 outputTypes["fasta"] = tempOutNames;
101                 outputTypes["qfile"] = tempOutNames;
102                 outputTypes["group"] = tempOutNames;
103         }
104         catch(exception& e) {
105                 m->errorOut(e, "TrimSeqsCommand", "TrimSeqsCommand");
106                 exit(1);
107         }
108 }
109 //***************************************************************************************************************
110
111 TrimSeqsCommand::TrimSeqsCommand(string option)  {
112         try {
113                 
114                 abort = false; calledHelp = false;   
115                 comboStarts = 0;
116                 
117                 //allow user to run help
118                 if(option == "help") { help(); abort = true; calledHelp = true; }
119                 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
120                 
121                 else {
122                         vector<string> myArray = setParameters();
123                         
124                         OptionParser parser(option);
125                         map<string,string> parameters = parser.getParameters();
126                         
127                         ValidParameters validParameter;
128                         map<string,string>::iterator it;
129                         
130                         //check to make sure all parameters are valid for command
131                         for (it = parameters.begin(); it != parameters.end(); it++) { 
132                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
133                         }
134                         
135                         //initialize outputTypes
136                         vector<string> tempOutNames;
137                         outputTypes["fasta"] = tempOutNames;
138                         outputTypes["qfile"] = tempOutNames;
139                         outputTypes["group"] = tempOutNames;
140                         
141                         //if the user changes the input directory command factory will send this info to us in the output parameter 
142                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
143                         if (inputDir == "not found"){   inputDir = "";          }
144                         else {
145                                 string path;
146                                 it = parameters.find("fasta");
147                                 //user has given a template file
148                                 if(it != parameters.end()){ 
149                                         path = m->hasPath(it->second);
150                                         //if the user has not given a path then, add inputdir. else leave path alone.
151                                         if (path == "") {       parameters["fasta"] = inputDir + it->second;            }
152                                 }
153                                 
154                                 it = parameters.find("oligos");
155                                 //user has given a template file
156                                 if(it != parameters.end()){ 
157                                         path = m->hasPath(it->second);
158                                         //if the user has not given a path then, add inputdir. else leave path alone.
159                                         if (path == "") {       parameters["oligos"] = inputDir + it->second;           }
160                                 }
161                                 
162                                 it = parameters.find("qfile");
163                                 //user has given a template file
164                                 if(it != parameters.end()){ 
165                                         path = m->hasPath(it->second);
166                                         //if the user has not given a path then, add inputdir. else leave path alone.
167                                         if (path == "") {       parameters["qfile"] = inputDir + it->second;            }
168                                 }
169                                 
170                         }
171
172                         
173                         //check for required parameters
174                         fastaFile = validParameter.validFile(parameters, "fasta", true);
175                         if (fastaFile == "not found") {                                 
176                                 fastaFile = m->getFastaFile(); 
177                                 if (fastaFile != "") { m->mothurOut("Using " + fastaFile + " as input file for the fasta parameter."); m->mothurOutEndLine(); }
178                                 else {  m->mothurOut("You have no current fastafile and the fasta parameter is required."); m->mothurOutEndLine(); abort = true; }
179                         }else if (fastaFile == "not open") { abort = true; }    
180                         
181                         //if the user changes the output directory command factory will send this info to us in the output parameter 
182                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  
183                                 outputDir = ""; 
184                                 outputDir += m->hasPath(fastaFile); //if user entered a file with a path then preserve it       
185                         }
186                 
187                         
188                         //check for optional parameter and set defaults
189                         // ...at some point should added some additional type checking...
190                         string temp;
191                         temp = validParameter.validFile(parameters, "flip", false);
192                         if (temp == "not found"){       flip = 0;       }
193                         else if(m->isTrue(temp))        {       flip = 1;       }
194                 
195                         temp = validParameter.validFile(parameters, "oligos", true);
196                         if (temp == "not found"){       oligoFile = "";         }
197                         else if(temp == "not open"){    abort = true;   } 
198                         else                                    {       oligoFile = temp;               }
199                         
200                         
201                         temp = validParameter.validFile(parameters, "maxambig", false);         if (temp == "not found") { temp = "-1"; }
202                         convert(temp, maxAmbig);  
203
204                         temp = validParameter.validFile(parameters, "maxhomop", false);         if (temp == "not found") { temp = "0"; }
205                         convert(temp, maxHomoP);  
206
207                         temp = validParameter.validFile(parameters, "minlength", false);        if (temp == "not found") { temp = "0"; }
208                         convert(temp, minLength); 
209                         
210                         temp = validParameter.validFile(parameters, "maxlength", false);        if (temp == "not found") { temp = "0"; }
211                         convert(temp, maxLength);
212                         
213                         temp = validParameter.validFile(parameters, "bdiffs", false);           if (temp == "not found") { temp = "0"; }
214                         convert(temp, bdiffs);
215                         
216                         temp = validParameter.validFile(parameters, "pdiffs", false);           if (temp == "not found") { temp = "0"; }
217                         convert(temp, pdiffs);
218                         
219                         temp = validParameter.validFile(parameters, "tdiffs", false);           if (temp == "not found") { int tempTotal = pdiffs + bdiffs;  temp = toString(tempTotal); }
220                         convert(temp, tdiffs);
221                         
222                         if(tdiffs == 0){        tdiffs = bdiffs + pdiffs;       }
223                         
224                         temp = validParameter.validFile(parameters, "qfile", true);     
225                         if (temp == "not found")        {       qFileName = "";         }
226                         else if(temp == "not open")     {       abort = true;           }
227                         else                                            {       qFileName = temp;       }
228                         
229                         temp = validParameter.validFile(parameters, "qthreshold", false);       if (temp == "not found") { temp = "0"; }
230                         convert(temp, qThreshold);
231                         
232                         temp = validParameter.validFile(parameters, "qtrim", false);            if (temp == "not found") { temp = "t"; }
233                         qtrim = m->isTrue(temp);
234
235                         temp = validParameter.validFile(parameters, "rollaverage", false);      if (temp == "not found") { temp = "0"; }
236                         convert(temp, qRollAverage);
237
238                         temp = validParameter.validFile(parameters, "qwindowaverage", false);if (temp == "not found") { temp = "0"; }
239                         convert(temp, qWindowAverage);
240
241                         temp = validParameter.validFile(parameters, "qwindowsize", false);      if (temp == "not found") { temp = "50"; }
242                         convert(temp, qWindowSize);
243
244                         temp = validParameter.validFile(parameters, "qstepsize", false);        if (temp == "not found") { temp = "1"; }
245                         convert(temp, qWindowStep);
246
247                         temp = validParameter.validFile(parameters, "qaverage", false);         if (temp == "not found") { temp = "0"; }
248                         convert(temp, qAverage);
249
250                         temp = validParameter.validFile(parameters, "keepfirst", false);        if (temp == "not found") { temp = "0"; }
251                         convert(temp, keepFirst);
252
253                         temp = validParameter.validFile(parameters, "removelast", false);       if (temp == "not found") { temp = "0"; }
254                         convert(temp, removeLast);
255                         
256                         temp = validParameter.validFile(parameters, "allfiles", false);         if (temp == "not found") { temp = "F"; }
257                         allFiles = m->isTrue(temp);
258                         
259                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found"){       temp = m->getProcessors();      }
260                         m->setProcessors(temp);
261                         convert(temp, processors); 
262                         
263                         
264                         if(allFiles && (oligoFile == "")){
265                                 m->mothurOut("You selected allfiles, but didn't enter an oligos.  Ignoring the allfiles request."); m->mothurOutEndLine();
266                         }
267                         if((qAverage != 0 && qThreshold != 0) && qFileName == ""){
268                                 m->mothurOut("You didn't provide a quality file name, quality criteria will be ignored."); m->mothurOutEndLine();
269                                 qAverage=0;
270                                 qThreshold=0;
271                         }
272                         if(!flip && oligoFile=="" && !maxLength && !minLength && (maxAmbig==-1) && !maxHomoP && qFileName == ""){               
273                                 m->mothurOut("You didn't set any options... quiting command."); m->mothurOutEndLine();
274                                 abort = true;
275                         }
276                 }
277
278         }
279         catch(exception& e) {
280                 m->errorOut(e, "TrimSeqsCommand", "TrimSeqsCommand");
281                 exit(1);
282         }
283 }
284 //***************************************************************************************************************
285
286 int TrimSeqsCommand::execute(){
287         try{
288         
289                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
290                 
291                 numFPrimers = 0;  //this needs to be initialized
292                 numRPrimers = 0;
293                 vector<vector<string> > fastaFileNames;
294                 vector<vector<string> > qualFileNames;
295                 
296                 string trimSeqFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "trim.fasta";
297                 outputNames.push_back(trimSeqFile); outputTypes["fasta"].push_back(trimSeqFile);
298                 
299                 string scrapSeqFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "scrap.fasta";
300                 outputNames.push_back(scrapSeqFile); outputTypes["fasta"].push_back(scrapSeqFile);
301                 
302                 string trimQualFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "trim.qual";
303                 string scrapQualFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "scrap.qual";
304                 if (qFileName != "") {
305                         outputNames.push_back(trimQualFile);
306                         outputNames.push_back(scrapQualFile);
307                         outputTypes["qfile"].push_back(trimQualFile);
308                         outputTypes["qfile"].push_back(scrapQualFile); 
309                 }
310                 
311                 string outputGroupFileName;
312                 if(oligoFile != ""){
313                         outputGroupFileName = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "groups";
314                         outputNames.push_back(outputGroupFileName); outputTypes["group"].push_back(outputGroupFileName);
315                         getOligos(fastaFileNames, qualFileNames);
316                 }
317
318                 vector<unsigned long int> fastaFilePos;
319                 vector<unsigned long int> qFilePos;
320                 
321                 setLines(fastaFile, qFileName, fastaFilePos, qFilePos);
322                 
323                 for (int i = 0; i < (fastaFilePos.size()-1); i++) {
324                         lines.push_back(new linePair(fastaFilePos[i], fastaFilePos[(i+1)]));
325                         if (qFileName != "") {  qLines.push_back(new linePair(qFilePos[i], qFilePos[(i+1)]));  }
326                 }       
327                 if(qFileName == "")     {       qLines = lines; } //files with duds
328                 
329                 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
330                                 if(processors == 1){
331                                         driverCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, outputGroupFileName, fastaFileNames, qualFileNames, lines[0], qLines[0]);
332                                 }else{
333                                         createProcessesCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, outputGroupFileName, fastaFileNames, qualFileNames); 
334                                 }       
335                 #else
336                                 driverCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, outputGroupFileName, fastaFileNames, qualFileNames, lines[0], qLines[0]);
337                 #endif
338                 
339                 if (m->control_pressed) {  return 0; }                  
340                         
341                 if(allFiles){
342                         map<string, string> uniqueFastaNames;// so we don't add the same groupfile multiple times
343                         map<string, string>::iterator it;
344                         set<string> namesToRemove;
345                         for(int i=0;i<fastaFileNames.size();i++){
346                                 for(int j=0;j<fastaFileNames[0].size();j++){
347                                         if (fastaFileNames[i][j] != "") {
348                                                 if(m->isBlank(fastaFileNames[i][j])){
349                                                         remove(fastaFileNames[i][j].c_str());
350                                                         namesToRemove.insert(fastaFileNames[i][j]);
351                                                         
352                                                         if(qFileName != ""){
353                                                                 remove(qualFileNames[i][j].c_str());
354                                                                 namesToRemove.insert(qualFileNames[i][j]);
355                                                         }
356                                                 }else{  
357                                                         it = uniqueFastaNames.find(fastaFileNames[i][j]);
358                                                         if (it == uniqueFastaNames.end()) {     
359                                                                 uniqueFastaNames[fastaFileNames[i][j]] = barcodeNameVector[i];  
360                                                         }       
361                                                 }
362                                         }
363                                 }
364                         }
365                         
366                         //remove names for outputFileNames, just cleans up the output
367                         vector<string> outputNames2;
368                         for(int i = 0; i < outputNames.size(); i++) { if (namesToRemove.count(outputNames[i]) == 0) { outputNames2.push_back(outputNames[i]); } }
369                         outputNames = outputNames2;
370                         
371                         for (it = uniqueFastaNames.begin(); it != uniqueFastaNames.end(); it++) {
372                                 ifstream in;
373                                 m->openInputFile(it->first, in);
374                                 
375                                 ofstream out;
376                                 string thisGroupName = outputDir + m->getRootName(m->getSimpleName(it->first)) + "groups";
377                                 outputNames.push_back(thisGroupName); outputTypes["group"].push_back(thisGroupName);
378                                 m->openOutputFile(thisGroupName, out);
379                                 
380                                 while (!in.eof()){
381                                         if (m->control_pressed) { break; }
382                                         
383                                         Sequence currSeq(in); m->gobble(in);
384                                         out << currSeq.getName() << '\t' << it->second << endl;
385                                 }
386                                 in.close();
387                                 out.close();
388                         }
389                 }
390                 
391                 if (m->control_pressed) {       for (int i = 0; i < outputNames.size(); i++) {  remove(outputNames[i].c_str()); } return 0;     }
392
393                 //output group counts
394                 m->mothurOutEndLine();
395                 int total = 0;
396                 for (map<string, int>::iterator it = groupCounts.begin(); it != groupCounts.end(); it++) {
397                          total += it->second; m->mothurOut("Group " + it->first + " contains " + toString(it->second) + " sequences."); m->mothurOutEndLine(); 
398                 }
399                 if (total != 0) { m->mothurOut("Total of all groups is " + toString(total)); m->mothurOutEndLine(); }
400                 
401                 if (m->control_pressed) {       for (int i = 0; i < outputNames.size(); i++) {  remove(outputNames[i].c_str()); } return 0;     }
402
403                 //set fasta file as new current fastafile
404                 string current = "";
405                 itTypes = outputTypes.find("fasta");
406                 if (itTypes != outputTypes.end()) {
407                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFastaFile(current); }
408                 }
409                 
410                 itTypes = outputTypes.find("qfile");
411                 if (itTypes != outputTypes.end()) {
412                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setQualFile(current); }
413                 }
414                 
415                 itTypes = outputTypes.find("group");
416                 if (itTypes != outputTypes.end()) {
417                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setGroupFile(current); }
418                 }
419
420                 m->mothurOutEndLine();
421                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
422                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
423                 m->mothurOutEndLine();
424                 
425                 return 0;       
426                         
427         }
428         catch(exception& e) {
429                 m->errorOut(e, "TrimSeqsCommand", "execute");
430                 exit(1);
431         }
432 }
433                 
434 /**************************************************************************************/
435
436 int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string trimFileName, string scrapFileName, string trimQFileName, string scrapQFileName, string groupFileName, vector<vector<string> > fastaFileNames, vector<vector<string> > qualFileNames, linePair* line, linePair* qline) {        
437                 
438         try {
439                 
440                 ofstream trimFASTAFile;
441                 m->openOutputFile(trimFileName, trimFASTAFile);
442                 
443                 ofstream scrapFASTAFile;
444                 m->openOutputFile(scrapFileName, scrapFASTAFile);
445                 
446                 ofstream trimQualFile;
447                 ofstream scrapQualFile;
448                 if(qFileName != ""){
449                         m->openOutputFile(trimQFileName, trimQualFile);
450                         m->openOutputFile(scrapQFileName, scrapQualFile);
451                 }
452                 
453                 ofstream outGroupsFile;
454                 if (oligoFile != ""){   m->openOutputFile(groupFileName, outGroupsFile);   }
455                 if(allFiles){
456                         for (int i = 0; i < fastaFileNames.size(); i++) { //clears old file
457                                 for (int j = 0; j < fastaFileNames[i].size(); j++) { //clears old file
458                                         if (fastaFileNames[i][j] != "") {
459                                                 ofstream temp;
460                                                 m->openOutputFile(fastaFileNames[i][j], temp);                  temp.close();
461                                                 if(qFileName != ""){
462                                                         m->openOutputFile(qualFileNames[i][j], temp);                   temp.close();
463                                                 }
464                                         }
465                                 }
466                         }
467                 }
468                 
469                 ifstream inFASTA;
470                 m->openInputFile(filename, inFASTA);
471                 inFASTA.seekg(line->start);
472                 
473                 ifstream qFile;
474                 if(qFileName != "")     {
475                         m->openInputFile(qFileName, qFile);
476                         qFile.seekg(qline->start);  
477                 }
478                 
479                 int count = 0;
480                 bool moreSeqs = 1;
481         
482                 while (moreSeqs) {
483                                 
484                         if (m->control_pressed) { 
485                                 inFASTA.close(); trimFASTAFile.close(); scrapFASTAFile.close();
486                                 if (oligoFile != "") {   outGroupsFile.close();   }
487
488                                 if(qFileName != ""){
489                                         qFile.close();
490                                 }
491                                 for (int i = 0; i < outputNames.size(); i++) {  remove(outputNames[i].c_str()); }
492
493                                 return 0;
494                         }
495                         
496                         int success = 1;
497                         string trashCode = "";
498                         int currentSeqsDiffs = 0;
499
500                         Sequence currSeq(inFASTA); m->gobble(inFASTA);
501                         
502                         QualityScores currQual;
503                         if(qFileName != ""){
504                                 currQual = QualityScores(qFile);  m->gobble(qFile);
505                         }
506
507                         string origSeq = currSeq.getUnaligned();
508                         if (origSeq != "") {
509                                 
510                                 int barcodeIndex = 0;
511                                 int primerIndex = 0;
512                                 
513                                 if(barcodes.size() != 0){
514                                         success = stripBarcode(currSeq, currQual, barcodeIndex);
515                                         if(success > bdiffs)            {       trashCode += 'b';       }
516                                         else{ currentSeqsDiffs += success;  }
517                                 }
518                                 
519                                 if(numFPrimers != 0){
520                                         success = stripForward(currSeq, currQual, primerIndex);
521                                         if(success > pdiffs)            {       trashCode += 'f';       }
522                                         else{ currentSeqsDiffs += success;  }
523                                 }
524                                 
525                                 if (currentSeqsDiffs > tdiffs)  {       trashCode += 't';   }
526                                 
527                                 if(numRPrimers != 0){
528                                         success = stripReverse(currSeq, currQual);
529                                         if(!success)                            {       trashCode += 'r';       }
530                                 }
531
532                                 if(keepFirst != 0){
533                                         success = keepFirstTrim(currSeq, currQual);
534                                 }
535                                 
536                                 if(removeLast != 0){
537                                         success = removeLastTrim(currSeq, currQual);
538                                         if(!success)                            {       trashCode += 'l';       }
539                                 }
540
541                                 
542                                 if(qFileName != ""){
543                                         int origLength = currSeq.getNumBases();
544                                         
545                                         if(qThreshold != 0)                     {       success = currQual.stripQualThreshold(currSeq, qThreshold);                     }
546                                         else if(qAverage != 0)          {       success = currQual.cullQualAverage(currSeq, qAverage);                          }
547                                         else if(qRollAverage != 0)      {       success = currQual.stripQualRollingAverage(currSeq, qRollAverage);      }
548                                         else if(qWindowAverage != 0){   success = currQual.stripQualWindowAverage(currSeq, qWindowStep, qWindowSize, qWindowAverage);   }
549                                         else                                            {       success = 1;                            }
550                                 
551                                         //you don't want to trim, if it fails above then scrap it
552                                         if ((!qtrim) && (origLength != currSeq.getNumBases())) {  success = 0; }
553                                         
554                                         if(!success)                            {       trashCode += 'q';       }
555                                 }                               
556                 
557                                 if(minLength > 0 || maxLength > 0){
558                                         success = cullLength(currSeq);
559                                         if(!success)                            {       trashCode += 'l';       }
560                                 }
561                                 if(maxHomoP > 0){
562                                         success = cullHomoP(currSeq);
563                                         if(!success)                            {       trashCode += 'h';       }
564                                 }
565                                 if(maxAmbig != -1){
566                                         success = cullAmbigs(currSeq);
567                                         if(!success)                            {       trashCode += 'n';       }
568                                 }
569                                 
570                                 if(flip){               // should go last                       
571                                         currSeq.reverseComplement();
572                                         if(qFileName != ""){
573                                                 currQual.flipQScores(); 
574                                         }
575                                 }
576                                 
577                                 if(trashCode.length() == 0){
578                                         currSeq.setAligned(currSeq.getUnaligned());
579                                         currSeq.printSequence(trimFASTAFile);
580                                         
581                                         if(qFileName != ""){
582                                                 currQual.printQScores(trimQualFile);
583                                         }
584                                         
585                                         if(barcodes.size() != 0){
586                                                 string thisGroup = barcodeNameVector[barcodeIndex];
587                                                 if (primers.size() != 0) { if (primerNameVector[primerIndex] != "") { thisGroup += "." + primerNameVector[primerIndex]; } }
588                                                 
589                                                 outGroupsFile << currSeq.getName() << '\t' << thisGroup << endl;
590                                                 
591                                                 map<string, int>::iterator it = groupCounts.find(thisGroup);
592                                                 if (it == groupCounts.end()) {  groupCounts[thisGroup] = 1; }
593                                                 else { groupCounts[it->first]++; }
594                                                         
595                                         }
596                                         
597                                         
598                                         if(allFiles){
599                                                 ofstream output;
600                                                 m->openOutputFileAppend(fastaFileNames[barcodeIndex][primerIndex], output);
601                                                 currSeq.printSequence(output);
602                                                 output.close();
603                                                 
604                                                 if(qFileName != ""){
605                                                         m->openOutputFileAppend(qualFileNames[barcodeIndex][primerIndex], output);
606                                                         currQual.printQScores(output);
607                                                         output.close();                                                 
608                                                 }
609                                         }
610                                 }
611                                 else{
612                                         currSeq.setName(currSeq.getName() + '|' + trashCode);
613                                         currSeq.setUnaligned(origSeq);
614                                         currSeq.setAligned(origSeq);
615                                         currSeq.printSequence(scrapFASTAFile);
616                                         if(qFileName != ""){
617                                                 currQual.printQScores(scrapQualFile);
618                                         }
619                                 }
620                                 count++;
621                         }
622                         
623                         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
624                                 unsigned long int pos = inFASTA.tellg();
625                                 if ((pos == -1) || (pos >= line->end)) { break; }
626                         #else
627                                 if (inFASTA.eof()) { break; }
628                         #endif
629                                 
630                         //report progress
631                         if((count) % 1000 == 0){        m->mothurOut(toString(count)); m->mothurOutEndLine();           }
632                         
633                 }
634                 //report progress
635                 if((count) % 1000 != 0){        m->mothurOut(toString(count)); m->mothurOutEndLine();           }
636
637                 
638                 inFASTA.close();
639                 trimFASTAFile.close();
640                 scrapFASTAFile.close();
641                 if (oligoFile != "") {   outGroupsFile.close();   }
642                 if(qFileName != "")     {       qFile.close();  scrapQualFile.close(); trimQualFile.close();    }
643                 
644                 return count;
645         }
646         catch(exception& e) {
647                 m->errorOut(e, "TrimSeqsCommand", "driverCreateTrim");
648                 exit(1);
649         }
650 }
651
652 /**************************************************************************************************/
653
654 int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName, string trimFASTAFileName, string scrapFASTAFileName, string trimQualFileName, string scrapQualFileName, string groupFile, vector<vector<string> > fastaFileNames, vector<vector<string> > qualFileNames) {
655         try {
656 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
657                 int process = 1;
658                 int exitCommand = 1;
659                 processIDS.clear();
660                 
661                 //loop through and create all the processes you want
662                 while (process != processors) {
663                         int pid = fork();
664                         
665                         if (pid > 0) {
666                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
667                                 process++;
668                         }else if (pid == 0){
669                                 
670                                 vector<vector<string> > tempFASTAFileNames = fastaFileNames;
671                                 vector<vector<string> > tempPrimerQualFileNames = qualFileNames;
672
673                                 if(allFiles){
674                                         ofstream temp;
675
676                                         for(int i=0;i<tempFASTAFileNames.size();i++){
677                                                 for(int j=0;j<tempFASTAFileNames[i].size();j++){
678                                                         if (tempFASTAFileNames[i][j] != "") {
679                                                                 tempFASTAFileNames[i][j] += toString(getpid()) + ".temp";
680                                                                 m->openOutputFile(tempFASTAFileNames[i][j], temp);                      temp.close();
681
682                                                                 if(qFileName != ""){
683                                                                         tempPrimerQualFileNames[i][j] += toString(getpid()) + ".temp";
684                                                                         m->openOutputFile(tempPrimerQualFileNames[i][j], temp);         temp.close();
685                                                                 }
686                                                         }
687                                                 }
688                                         }
689                                 }
690                                                         
691                                 driverCreateTrim(filename,
692                                                                  qFileName,
693                                                                  (trimFASTAFileName + toString(getpid()) + ".temp"),
694                                                                  (scrapFASTAFileName + toString(getpid()) + ".temp"),
695                                                                  (trimQualFileName + toString(getpid()) + ".temp"),
696                                                                  (scrapQualFileName + toString(getpid()) + ".temp"),
697                                                                  (groupFile + toString(getpid()) + ".temp"),
698                                                                  tempFASTAFileNames,
699                                                                  tempPrimerQualFileNames,
700                                                                  lines[process],
701                                                                  qLines[process]);
702                                 
703                                 //pass groupCounts to parent
704                                 ofstream out;
705                                 string tempFile = filename + toString(getpid()) + ".num.temp";
706                                 m->openOutputFile(tempFile, out);
707                                 for (map<string, int>::iterator it = groupCounts.begin(); it != groupCounts.end(); it++) {
708                                         out << it->first << '\t' << it->second << endl;
709                                 }
710                                 out.close();
711                                 
712                                 exit(0);
713                         }else { 
714                                 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine(); 
715                                 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
716                                 exit(0);
717                         }
718                 }
719                 
720                 //parent do my part
721                 ofstream temp;
722                 m->openOutputFile(trimFASTAFileName, temp);             temp.close();
723                 m->openOutputFile(scrapFASTAFileName, temp);    temp.close();
724                 m->openOutputFile(trimQualFileName, temp);              temp.close();
725                 m->openOutputFile(scrapQualFileName, temp);             temp.close();
726
727                 driverCreateTrim(filename, qFileName, trimFASTAFileName, scrapFASTAFileName, trimQualFileName, scrapQualFileName, groupFile, fastaFileNames, qualFileNames, lines[0], qLines[0]);
728                 
729                 //force parent to wait until all the processes are done
730                 for (int i=0;i<processIDS.size();i++) { 
731                         int temp = processIDS[i];
732                         wait(&temp);
733                 }
734                 
735                 //append files
736                 for(int i=0;i<processIDS.size();i++){
737                         
738                         m->mothurOut("Appending files from process " + toString(processIDS[i])); m->mothurOutEndLine();
739                         
740                         m->appendFiles((trimFASTAFileName + toString(processIDS[i]) + ".temp"), trimFASTAFileName);
741                         remove((trimFASTAFileName + toString(processIDS[i]) + ".temp").c_str());
742                         m->appendFiles((scrapFASTAFileName + toString(processIDS[i]) + ".temp"), scrapFASTAFileName);
743                         remove((scrapFASTAFileName + toString(processIDS[i]) + ".temp").c_str());
744                         
745                         if(qFileName != ""){
746                                 m->appendFiles((trimQualFileName + toString(processIDS[i]) + ".temp"), trimQualFileName);
747                                 remove((trimQualFileName + toString(processIDS[i]) + ".temp").c_str());
748                                 m->appendFiles((scrapQualFileName + toString(processIDS[i]) + ".temp"), scrapQualFileName);
749                                 remove((scrapQualFileName + toString(processIDS[i]) + ".temp").c_str());
750                         }
751                         
752                         m->appendFiles((groupFile + toString(processIDS[i]) + ".temp"), groupFile);
753                         remove((groupFile + toString(processIDS[i]) + ".temp").c_str());
754                         
755                         
756                         if(allFiles){
757                                 for(int j=0;j<fastaFileNames.size();j++){
758                                         for(int k=0;k<fastaFileNames[j].size();k++){
759                                                 if (fastaFileNames[j][k] != "") {
760                                                         m->appendFiles((fastaFileNames[j][k] + toString(processIDS[i]) + ".temp"), fastaFileNames[j][k]);
761                                                         remove((fastaFileNames[j][k] + toString(processIDS[i]) + ".temp").c_str());
762                                                         
763                                                         if(qFileName != ""){
764                                                                 m->appendFiles((qualFileNames[j][k] + toString(processIDS[i]) + ".temp"), qualFileNames[j][k]);
765                                                                 remove((qualFileNames[j][k] + toString(processIDS[i]) + ".temp").c_str());
766                                                         }
767                                                 }
768                                         }
769                                 }
770                         }
771                         
772                         ifstream in;
773                         string tempFile =  filename + toString(processIDS[i]) + ".num.temp";
774                         m->openInputFile(tempFile, in);
775                         int tempNum;
776                         string group;
777                         while (!in.eof()) { 
778                                 in >> group >> tempNum; m->gobble(in);
779                                 
780                                 map<string, int>::iterator it = groupCounts.find(group);
781                                 if (it == groupCounts.end()) {  groupCounts[group] = tempNum; }
782                                 else { groupCounts[it->first] += tempNum; }
783                         }
784                         in.close(); remove(tempFile.c_str());
785                         
786                 }
787         
788                 return exitCommand;
789 #endif          
790         }
791         catch(exception& e) {
792                 m->errorOut(e, "TrimSeqsCommand", "createProcessesCreateTrim");
793                 exit(1);
794         }
795 }
796
797 /**************************************************************************************************/
798
799 int TrimSeqsCommand::setLines(string filename, string qfilename, vector<unsigned long int>& fastaFilePos, vector<unsigned long int>& qfileFilePos) {
800         try {
801                 
802                 //set file positions for fasta file
803                 fastaFilePos = m->divideFile(filename, processors);
804                 
805                 if (qfilename == "") { return processors; }
806                 
807                 //get name of first sequence in each chunk
808                 map<string, int> firstSeqNames;
809                 for (int i = 0; i < (fastaFilePos.size()-1); i++) {
810                         ifstream in;
811                         m->openInputFile(filename, in);
812                         in.seekg(fastaFilePos[i]);
813                 
814                         Sequence temp(in); 
815                         firstSeqNames[temp.getName()] = i;
816                 
817                         in.close();
818                 }
819                                 
820                 //seach for filePos of each first name in the qfile and save in qfileFilePos
821                 ifstream inQual;
822                 m->openInputFile(qfilename, inQual);
823                 
824                 string input;
825                 while(!inQual.eof()){   
826                         input = m->getline(inQual);
827
828                         if (input.length() != 0) {
829                                 if(input[0] == '>'){ //this is a sequence name line
830                                         istringstream nameStream(input);
831                                         
832                                         string sname = "";  nameStream >> sname;
833                                         sname = sname.substr(1);
834                                         
835                                         map<string, int>::iterator it = firstSeqNames.find(sname);
836                                         
837                                         if(it != firstSeqNames.end()) { //this is the start of a new chunk
838                                                 unsigned long int pos = inQual.tellg(); 
839                                                 qfileFilePos.push_back(pos - input.length() - 1);       
840                                                 firstSeqNames.erase(it);
841                                         }
842                                 }
843                         }
844                         
845                         if (firstSeqNames.size() == 0) { break; }
846                 }
847                 inQual.close();
848                 
849                 
850                 if (firstSeqNames.size() != 0) { 
851                         for (map<string, int>::iterator it = firstSeqNames.begin(); it != firstSeqNames.end(); it++) {
852                                 m->mothurOut(it->first + " is in your fasta file and not in your quality file, not using quality file."); m->mothurOutEndLine();
853                         }
854                         qFileName = "";
855                         return processors;
856                 }
857
858                 //get last file position of qfile
859                 FILE * pFile;
860                 unsigned long int size;
861                 
862                 //get num bytes in file
863                 pFile = fopen (qfilename.c_str(),"rb");
864                 if (pFile==NULL) perror ("Error opening file");
865                 else{
866                         fseek (pFile, 0, SEEK_END);
867                         size=ftell (pFile);
868                         fclose (pFile);
869                 }
870                 
871                 qfileFilePos.push_back(size);
872                 
873                 return processors;
874         }
875         catch(exception& e) {
876                 m->errorOut(e, "TrimSeqsCommand", "setLines");
877                 exit(1);
878         }
879 }
880
881 //***************************************************************************************************************
882
883 void TrimSeqsCommand::getOligos(vector<vector<string> >& fastaFileNames, vector<vector<string> >& qualFileNames){
884         try {
885                 ifstream inOligos;
886                 m->openInputFile(oligoFile, inOligos);
887                 
888                 ofstream test;
889                 
890                 string type, oligo, group;
891
892                 int indexPrimer = 0;
893                 int indexBarcode = 0;
894                 
895                 while(!inOligos.eof()){
896
897                         inOligos >> type; 
898                                         
899                         if(type[0] == '#'){
900                                 while (!inOligos.eof()) {       char c = inOligos.get();  if (c == 10 || c == 13){      break;  }       } // get rest of line if there's any crap there
901                                 m->gobble(inOligos);
902                         }
903                         else{
904                                 m->gobble(inOligos);
905                                 //make type case insensitive
906                                 for(int i=0;i<type.length();i++){       type[i] = toupper(type[i]);  }
907                                 
908                                 inOligos >> oligo;
909                                 
910                                 for(int i=0;i<oligo.length();i++){
911                                         oligo[i] = toupper(oligo[i]);
912                                         if(oligo[i] == 'U')     {       oligo[i] = 'T'; }
913                                 }
914                                 
915                                 if(type == "FORWARD"){
916                                         group = "";
917                                         
918                                         // get rest of line in case there is a primer name
919                                         while (!inOligos.eof()) {       
920                                                 char c = inOligos.get(); 
921                                                 if (c == 10 || c == 13){        break;  }
922                                                 else if (c == 32 || c == 9){;} //space or tab
923                                                 else {  group += c;  }
924                                         } 
925                                         
926                                         //check for repeat barcodes
927                                         map<string, int>::iterator itPrime = primers.find(oligo);
928                                         if (itPrime != primers.end()) { m->mothurOut("primer " + oligo + " is in your oligos file already."); m->mothurOutEndLine();  }
929                                         
930                                         primers[oligo]=indexPrimer; indexPrimer++;              
931                                         primerNameVector.push_back(group);
932                                 }
933                                 else if(type == "REVERSE"){
934                                         Sequence oligoRC("reverse", oligo);
935                                         oligoRC.reverseComplement();
936                                         revPrimer.push_back(oligoRC.getUnaligned());
937                                 }
938                                 else if(type == "BARCODE"){
939                                         inOligos >> group;
940                                         
941                                         //check for repeat barcodes
942                                         map<string, int>::iterator itBar = barcodes.find(oligo);
943                                         if (itBar != barcodes.end()) { m->mothurOut("barcode " + oligo + " is in your oligos file already."); m->mothurOutEndLine();  }
944                                                 
945                                         barcodes[oligo]=indexBarcode; indexBarcode++;
946                                         barcodeNameVector.push_back(group);
947                                 }
948                                 else{   m->mothurOut(type + " is not recognized as a valid type. Choices are forward, reverse, and barcode. Ignoring " + oligo + "."); m->mothurOutEndLine();  }
949                         }
950                         m->gobble(inOligos);
951                 }       
952                 inOligos.close();
953                 
954                 if(barcodeNameVector.size() == 0 && primerNameVector[0] == ""){ allFiles = 0;   }
955                 
956                 //add in potential combos
957                 if(barcodeNameVector.size() == 0){
958                         barcodes[""] = 0;
959                         barcodeNameVector.push_back("");                        
960                 }
961                 
962                 if(primerNameVector.size() == 0){
963                         primers[""] = 0;
964                         primerNameVector.push_back("");                 
965                 }
966                 
967                 fastaFileNames.resize(barcodeNameVector.size());
968                 for(int i=0;i<fastaFileNames.size();i++){
969                         fastaFileNames[i].assign(primerNameVector.size(), "");
970                 }
971                 if(qFileName != ""){    qualFileNames = fastaFileNames; }
972                 
973                 if(allFiles){
974                         set<string> uniqueNames; //used to cleanup outputFileNames
975                         for(map<string, int>::iterator itBar = barcodes.begin();itBar != barcodes.end();itBar++){
976                                 for(map<string, int>::iterator itPrimer = primers.begin();itPrimer != primers.end(); itPrimer++){
977                                         
978                                         string primerName = primerNameVector[itPrimer->second];
979                                         string barcodeName = barcodeNameVector[itBar->second];
980                                         
981                                         string comboGroupName = "";
982                                         string fastaFileName = "";
983                                         string qualFileName = "";
984                                         
985                                         if(primerName == ""){
986                                                 comboGroupName = barcodeNameVector[itBar->second];
987                                         }
988                                         else{
989                                                 if(barcodeName == ""){
990                                                         comboGroupName = primerNameVector[itPrimer->second];
991                                                 }
992                                                 else{
993                                                         comboGroupName = barcodeNameVector[itBar->second] + "." + primerNameVector[itPrimer->second];
994                                                 }
995                                         }
996
997                                         ofstream temp;
998                                         fastaFileName = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + comboGroupName + ".fasta";
999                                         if (uniqueNames.count(fastaFileName) == 0) {
1000                                                 outputNames.push_back(fastaFileName);
1001                                                 outputTypes["fasta"].push_back(fastaFileName);
1002                                                 uniqueNames.insert(fastaFileName);
1003                                         }
1004                                         
1005                                         fastaFileNames[itBar->second][itPrimer->second] = fastaFileName;
1006                                         m->openOutputFile(fastaFileName, temp);         temp.close();
1007
1008                                         if(qFileName != ""){
1009                                                 qualFileName = outputDir + m->getRootName(m->getSimpleName(qFileName)) + comboGroupName + ".qual";
1010                                                 if (uniqueNames.count(fastaFileName) == 0) {
1011                                                         outputNames.push_back(qualFileName);
1012                                                         outputTypes["qfile"].push_back(qualFileName);
1013                                                 }
1014                                                 
1015                                                 qualFileNames[itBar->second][itPrimer->second] = qualFileName;
1016                                                 m->openOutputFile(qualFileName, temp);          temp.close();
1017                                         }
1018                                 }
1019                         }
1020                 }
1021                 numFPrimers = primers.size();
1022                 numRPrimers = revPrimer.size();
1023
1024         }
1025         catch(exception& e) {
1026                 m->errorOut(e, "TrimSeqsCommand", "getOligos");
1027                 exit(1);
1028         }
1029 }
1030
1031 //***************************************************************************************************************
1032
1033 int TrimSeqsCommand::stripBarcode(Sequence& seq, QualityScores& qual, int& group){
1034         try {
1035                 
1036                 string rawSequence = seq.getUnaligned();
1037                 int success = bdiffs + 1;       //guilty until proven innocent
1038                 
1039                 //can you find the barcode
1040                 for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
1041                         string oligo = it->first;
1042                         if(rawSequence.length() < oligo.length()){      //let's just assume that the barcodes are the same length
1043                                 success = bdiffs + 10;                                  //if the sequence is shorter than the barcode then bail out
1044                                 break;  
1045                         }
1046                         
1047                         if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
1048                                 group = it->second;
1049                                 seq.setUnaligned(rawSequence.substr(oligo.length()));
1050                                 
1051                                 if(qual.getName() != ""){
1052                                         qual.trimQScores(oligo.length(), -1);
1053                                 }
1054                                 
1055                                 success = 0;
1056                                 break;
1057                         }
1058                 }
1059                 
1060                 //if you found the barcode or if you don't want to allow for diffs
1061                 if ((bdiffs == 0) || (success == 0)) { return success;  }
1062                 
1063                 else { //try aligning and see if you can find it
1064
1065                         int maxLength = 0;
1066
1067                         Alignment* alignment;
1068                         if (barcodes.size() > 0) {
1069                                 map<string,int>::iterator it=barcodes.begin();
1070
1071                                 for(it;it!=barcodes.end();it++){
1072                                         if(it->first.length() > maxLength){
1073                                                 maxLength = it->first.length();
1074                                         }
1075                                 }
1076                                 alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+bdiffs+1));  
1077
1078                         }else{ alignment = NULL; } 
1079                         
1080                         //can you find the barcode
1081                         int minDiff = 1e6;
1082                         int minCount = 1;
1083                         int minGroup = -1;
1084                         int minPos = 0;
1085                         
1086                         for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
1087                                 string oligo = it->first;
1088 //                              int length = oligo.length();
1089                                 
1090                                 if(rawSequence.length() < maxLength){   //let's just assume that the barcodes are the same length
1091                                         success = bdiffs + 10;
1092                                         break;
1093                                 }
1094                                 
1095                                 //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
1096                                 alignment->align(oligo, rawSequence.substr(0,oligo.length()+bdiffs));
1097                                 oligo = alignment->getSeqAAln();
1098                                 string temp = alignment->getSeqBAln();
1099                 
1100                                 int alnLength = oligo.length();
1101                                 
1102                                 for(int i=oligo.length()-1;i>=0;i--){
1103                                         if(oligo[i] != '-'){    alnLength = i+1;        break;  }
1104                                 }
1105                                 oligo = oligo.substr(0,alnLength);
1106                                 temp = temp.substr(0,alnLength);
1107                                 
1108                                 int numDiff = countDiffs(oligo, temp);
1109                                 
1110                                 if(numDiff < minDiff){
1111                                         minDiff = numDiff;
1112                                         minCount = 1;
1113                                         minGroup = it->second;
1114                                         minPos = 0;
1115                                         for(int i=0;i<alnLength;i++){
1116                                                 if(temp[i] != '-'){
1117                                                         minPos++;
1118                                                 }
1119                                         }
1120                                 }
1121                                 else if(numDiff == minDiff){
1122                                         minCount++;
1123                                 }
1124
1125                         }
1126
1127                         if(minDiff > bdiffs)    {       success = minDiff;              }       //no good matches
1128                         else if(minCount > 1)   {       success = bdiffs + 100; }       //can't tell the difference between multiple barcodes
1129                         else{                                                                                                   //use the best match
1130                                 group = minGroup;
1131                                 seq.setUnaligned(rawSequence.substr(minPos));
1132                                 
1133                                 if(qual.getName() != ""){
1134                                         qual.trimQScores(minPos, -1);
1135                                 }
1136                                 success = minDiff;
1137                         }
1138                         
1139                         if (alignment != NULL) {  delete alignment;  }
1140                         
1141                 }
1142                 
1143                 return success;
1144                 
1145         }
1146         catch(exception& e) {
1147                 m->errorOut(e, "TrimSeqsCommand", "stripBarcode");
1148                 exit(1);
1149         }
1150
1151 }
1152
1153 //***************************************************************************************************************
1154
1155 int TrimSeqsCommand::stripForward(Sequence& seq, QualityScores& qual, int& group){
1156         try {
1157                 string rawSequence = seq.getUnaligned();
1158                 int success = pdiffs + 1;       //guilty until proven innocent
1159                 
1160                 //can you find the primer
1161                 for(map<string,int>::iterator it=primers.begin();it!=primers.end();it++){
1162                         string oligo = it->first;
1163                         if(rawSequence.length() < oligo.length()){      //let's just assume that the primers are the same length
1164                                 success = pdiffs + 10;                                  //if the sequence is shorter than the barcode then bail out
1165                                 break;  
1166                         }
1167                         
1168                         if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
1169                                 group = it->second;
1170                                 seq.setUnaligned(rawSequence.substr(oligo.length()));
1171                                 if(qual.getName() != ""){
1172                                         qual.trimQScores(oligo.length(), -1);
1173                                 }
1174                                 success = 0;
1175                                 break;
1176                         }
1177                 }
1178
1179                 //if you found the barcode or if you don't want to allow for diffs
1180                 if ((pdiffs == 0) || (success == 0)) { return success;  }
1181                 
1182                 else { //try aligning and see if you can find it
1183
1184                         int maxLength = 0;
1185
1186                         Alignment* alignment;
1187                         if (primers.size() > 0) {
1188                                 map<string,int>::iterator it=primers.begin();
1189
1190                                 for(it;it!=primers.end();it++){
1191                                         if(it->first.length() > maxLength){
1192                                                 maxLength = it->first.length();
1193                                         }
1194                                 }
1195                                 alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+pdiffs+1));  
1196
1197                         }else{ alignment = NULL; } 
1198                         
1199                         //can you find the barcode
1200                         int minDiff = 1e6;
1201                         int minCount = 1;
1202                         int minGroup = -1;
1203                         int minPos = 0;
1204                         
1205                         for(map<string,int>::iterator it=primers.begin();it!=primers.end();it++){
1206                                 string oligo = it->first;
1207 //                              int length = oligo.length();
1208                                 
1209                                 if(rawSequence.length() < maxLength){   
1210                                         success = pdiffs + 100;
1211                                         break;
1212                                 }
1213                                 
1214                                 //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
1215                                 alignment->align(oligo, rawSequence.substr(0,oligo.length()+pdiffs));
1216                                 oligo = alignment->getSeqAAln();
1217                                 string temp = alignment->getSeqBAln();
1218                 
1219                                 int alnLength = oligo.length();
1220                                 
1221                                 for(int i=oligo.length()-1;i>=0;i--){
1222                                         if(oligo[i] != '-'){    alnLength = i+1;        break;  }
1223                                 }
1224                                 oligo = oligo.substr(0,alnLength);
1225                                 temp = temp.substr(0,alnLength);
1226                                 
1227                                 int numDiff = countDiffs(oligo, temp);
1228                                 
1229                                 if(numDiff < minDiff){
1230                                         minDiff = numDiff;
1231                                         minCount = 1;
1232                                         minGroup = it->second;
1233                                         minPos = 0;
1234                                         for(int i=0;i<alnLength;i++){
1235                                                 if(temp[i] != '-'){
1236                                                         minPos++;
1237                                                 }
1238                                         }
1239                                 }
1240                                 else if(numDiff == minDiff){
1241                                         minCount++;
1242                                 }
1243
1244                         }
1245
1246                         if(minDiff > pdiffs)    {       success = minDiff;              }       //no good matches
1247                         else if(minCount > 1)   {       success = pdiffs + 10;  }       //can't tell the difference between multiple primers
1248                         else{                                                                                                   //use the best match
1249                                 group = minGroup;
1250                                 seq.setUnaligned(rawSequence.substr(minPos));
1251                                 if(qual.getName() != ""){
1252                                         qual.trimQScores(minPos, -1);
1253                                 }
1254                                 success = minDiff;
1255                         }
1256                         
1257                         if (alignment != NULL) {  delete alignment;  }
1258                         
1259                 }
1260                 
1261                 return success;
1262
1263         }
1264         catch(exception& e) {
1265                 m->errorOut(e, "TrimSeqsCommand", "stripForward");
1266                 exit(1);
1267         }
1268 }
1269
1270 //***************************************************************************************************************
1271
1272 bool TrimSeqsCommand::stripReverse(Sequence& seq, QualityScores& qual){
1273         try {
1274                 string rawSequence = seq.getUnaligned();
1275                 bool success = 0;       //guilty until proven innocent
1276                 
1277                 for(int i=0;i<numRPrimers;i++){
1278                         string oligo = revPrimer[i];
1279                         
1280                         if(rawSequence.length() < oligo.length()){
1281                                 success = 0;
1282                                 break;
1283                         }
1284                         
1285                         if(compareDNASeq(oligo, rawSequence.substr(rawSequence.length()-oligo.length(),oligo.length()))){
1286                                 seq.setUnaligned(rawSequence.substr(0,rawSequence.length()-oligo.length()));
1287                                 if(qual.getName() != ""){
1288                                         qual.trimQScores(-1, rawSequence.length()-oligo.length());
1289                                 }
1290                                 success = 1;
1291                                 break;
1292                         }
1293                 }       
1294                 return success;
1295                 
1296         }
1297         catch(exception& e) {
1298                 m->errorOut(e, "TrimSeqsCommand", "stripReverse");
1299                 exit(1);
1300         }
1301 }
1302
1303 //***************************************************************************************************************
1304
1305 bool TrimSeqsCommand::keepFirstTrim(Sequence& sequence, QualityScores& qscores){
1306         try {
1307                 bool success = 1;
1308                 if(qscores.getName() != ""){
1309                         qscores.trimQScores(-1, keepFirst);
1310                 }
1311                 sequence.trim(keepFirst);
1312                 return success;
1313         }
1314         catch(exception& e) {
1315                 m->errorOut(e, "keepFirstTrim", "countDiffs");
1316                 exit(1);
1317         }
1318         
1319 }       
1320
1321 //***************************************************************************************************************
1322
1323 bool TrimSeqsCommand::removeLastTrim(Sequence& sequence, QualityScores& qscores){
1324         try {
1325                 bool success = 0;
1326                 
1327                 int length = sequence.getNumBases() - removeLast;
1328                 
1329                 if(length > 0){
1330                         if(qscores.getName() != ""){
1331                                 qscores.trimQScores(-1, length);
1332                         }
1333                         sequence.trim(length);
1334                         success = 1;
1335                 }
1336                 else{
1337                         success = 0;
1338                 }
1339
1340                 return success;
1341         }
1342         catch(exception& e) {
1343                 m->errorOut(e, "removeLastTrim", "countDiffs");
1344                 exit(1);
1345         }
1346         
1347 }       
1348
1349 //***************************************************************************************************************
1350
1351 bool TrimSeqsCommand::cullLength(Sequence& seq){
1352         try {
1353         
1354                 int length = seq.getNumBases();
1355                 bool success = 0;       //guilty until proven innocent
1356                 
1357                 if(length >= minLength && maxLength == 0)                       {       success = 1;    }
1358                 else if(length >= minLength && length <= maxLength)     {       success = 1;    }
1359                 else                                                                                            {       success = 0;    }
1360                 
1361                 return success;
1362         
1363         }
1364         catch(exception& e) {
1365                 m->errorOut(e, "TrimSeqsCommand", "cullLength");
1366                 exit(1);
1367         }
1368         
1369 }
1370
1371 //***************************************************************************************************************
1372
1373 bool TrimSeqsCommand::cullHomoP(Sequence& seq){
1374         try {
1375                 int longHomoP = seq.getLongHomoPolymer();
1376                 bool success = 0;       //guilty until proven innocent
1377                 
1378                 if(longHomoP <= maxHomoP){      success = 1;    }
1379                 else                                    {       success = 0;    }
1380                 
1381                 return success;
1382         }
1383         catch(exception& e) {
1384                 m->errorOut(e, "TrimSeqsCommand", "cullHomoP");
1385                 exit(1);
1386         }
1387         
1388 }
1389
1390 //***************************************************************************************************************
1391
1392 bool TrimSeqsCommand::cullAmbigs(Sequence& seq){
1393         try {
1394                 int numNs = seq.getAmbigBases();
1395                 bool success = 0;       //guilty until proven innocent
1396                 
1397                 if(numNs <= maxAmbig)   {       success = 1;    }
1398                 else                                    {       success = 0;    }
1399                 
1400                 return success;
1401         }
1402         catch(exception& e) {
1403                 m->errorOut(e, "TrimSeqsCommand", "cullAmbigs");
1404                 exit(1);
1405         }
1406         
1407 }
1408
1409 //***************************************************************************************************************
1410
1411 bool TrimSeqsCommand::compareDNASeq(string oligo, string seq){
1412         try {
1413                 bool success = 1;
1414                 int length = oligo.length();
1415                 
1416                 for(int i=0;i<length;i++){
1417                         
1418                         if(oligo[i] != seq[i]){
1419                                 if(oligo[i] == 'A' || oligo[i] == 'T' || oligo[i] == 'G' || oligo[i] == 'C')    {       success = 0;    }
1420                                 else if((oligo[i] == 'N' || oligo[i] == 'I') && (seq[i] == 'N'))                                {       success = 0;    }
1421                                 else if(oligo[i] == 'R' && (seq[i] != 'A' && seq[i] != 'G'))                                    {       success = 0;    }
1422                                 else if(oligo[i] == 'Y' && (seq[i] != 'C' && seq[i] != 'T'))                                    {       success = 0;    }
1423                                 else if(oligo[i] == 'M' && (seq[i] != 'C' && seq[i] != 'A'))                                    {       success = 0;    }
1424                                 else if(oligo[i] == 'K' && (seq[i] != 'T' && seq[i] != 'G'))                                    {       success = 0;    }
1425                                 else if(oligo[i] == 'W' && (seq[i] != 'T' && seq[i] != 'A'))                                    {       success = 0;    }
1426                                 else if(oligo[i] == 'S' && (seq[i] != 'C' && seq[i] != 'G'))                                    {       success = 0;    }
1427                                 else if(oligo[i] == 'B' && (seq[i] != 'C' && seq[i] != 'T' && seq[i] != 'G'))   {       success = 0;    }
1428                                 else if(oligo[i] == 'D' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'G'))   {       success = 0;    }
1429                                 else if(oligo[i] == 'H' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'C'))   {       success = 0;    }
1430                                 else if(oligo[i] == 'V' && (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G'))   {       success = 0;    }                       
1431                                 
1432                                 if(success == 0)        {       break;   }
1433                         }
1434                         else{
1435                                 success = 1;
1436                         }
1437                 }
1438                 
1439                 return success;
1440         }
1441         catch(exception& e) {
1442                 m->errorOut(e, "TrimSeqsCommand", "compareDNASeq");
1443                 exit(1);
1444         }
1445
1446 }
1447
1448 //***************************************************************************************************************
1449
1450 int TrimSeqsCommand::countDiffs(string oligo, string seq){
1451         try {
1452
1453                 int length = oligo.length();
1454                 int countDiffs = 0;
1455                 
1456                 for(int i=0;i<length;i++){
1457                                                                 
1458                         if(oligo[i] != seq[i]){
1459                                 if(oligo[i] == 'A' || oligo[i] == 'T' || oligo[i] == 'G' || oligo[i] == 'C' || oligo[i] == '-' || oligo[i] == '.')      {       countDiffs++;   }
1460                                 else if((oligo[i] == 'N' || oligo[i] == 'I') && (seq[i] == 'N'))                                {       countDiffs++;   }
1461                                 else if(oligo[i] == 'R' && (seq[i] != 'A' && seq[i] != 'G'))                                    {       countDiffs++;   }
1462                                 else if(oligo[i] == 'Y' && (seq[i] != 'C' && seq[i] != 'T'))                                    {       countDiffs++;   }
1463                                 else if(oligo[i] == 'M' && (seq[i] != 'C' && seq[i] != 'A'))                                    {       countDiffs++;   }
1464                                 else if(oligo[i] == 'K' && (seq[i] != 'T' && seq[i] != 'G'))                                    {       countDiffs++;   }
1465                                 else if(oligo[i] == 'W' && (seq[i] != 'T' && seq[i] != 'A'))                                    {       countDiffs++;   }
1466                                 else if(oligo[i] == 'S' && (seq[i] != 'C' && seq[i] != 'G'))                                    {       countDiffs++;   }
1467                                 else if(oligo[i] == 'B' && (seq[i] != 'C' && seq[i] != 'T' && seq[i] != 'G'))   {       countDiffs++;   }
1468                                 else if(oligo[i] == 'D' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'G'))   {       countDiffs++;   }
1469                                 else if(oligo[i] == 'H' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'C'))   {       countDiffs++;   }
1470                                 else if(oligo[i] == 'V' && (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G'))   {       countDiffs++;   }       
1471                         }
1472                         
1473                 }
1474                 
1475                 return countDiffs;
1476         }
1477         catch(exception& e) {
1478                 m->errorOut(e, "TrimSeqsCommand", "countDiffs");
1479                 exit(1);
1480         }
1481
1482 }
1483
1484 //***************************************************************************************************************