]> git.donarmstrong.com Git - mothur.git/blob - trimseqscommand.cpp
working on citations
[mothur.git] / trimseqscommand.cpp
1 /*
2  *  trimseqscommand.cpp
3  *  Mothur
4  *
5  *  Created by Pat Schloss on 6/6/09.
6  *  Copyright 2009 Patrick D. Schloss. All rights reserved.
7  *
8  */
9
10 #include "trimseqscommand.h"
11 #include "needlemanoverlap.hpp"
12
13 //**********************************************************************************************************************
14 vector<string> TrimSeqsCommand::setParameters(){        
15         try {
16                 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta);
17                 CommandParameter poligos("oligos", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(poligos);
18                 CommandParameter pqfile("qfile", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pqfile);
19                 CommandParameter pflip("flip", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pflip);
20                 CommandParameter pmaxambig("maxambig", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pmaxambig);
21                 CommandParameter pmaxhomop("maxhomop", "Number", "", "0", "", "", "",false,false); parameters.push_back(pmaxhomop);
22                 CommandParameter pminlength("minlength", "Number", "", "0", "", "", "",false,false); parameters.push_back(pminlength);
23                 CommandParameter pmaxlength("maxlength", "Number", "", "0", "", "", "",false,false); parameters.push_back(pmaxlength);
24                 CommandParameter ppdiffs("pdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(ppdiffs);
25                 CommandParameter pbdiffs("bdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(pbdiffs);
26                 CommandParameter ptdiffs("tdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(ptdiffs);
27                 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
28                 CommandParameter pallfiles("allfiles", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pallfiles);
29                 CommandParameter pqtrim("qtrim", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pqtrim);
30                 CommandParameter pqthreshold("qthreshold", "Number", "", "0", "", "", "",false,false); parameters.push_back(pqthreshold);
31                 CommandParameter pqaverage("qaverage", "Number", "", "0", "", "", "",false,false); parameters.push_back(pqaverage);
32                 CommandParameter prollaverage("rollaverage", "Number", "", "0", "", "", "",false,false); parameters.push_back(prollaverage);
33                 CommandParameter pqwindowaverage("qwindowaverage", "Number", "", "0", "", "", "",false,false); parameters.push_back(pqwindowaverage);
34                 CommandParameter pqstepsize("qstepsize", "Number", "", "1", "", "", "",false,false); parameters.push_back(pqstepsize);
35                 CommandParameter pqwindowsize("qwindowsize", "Number", "", "50", "", "", "",false,false); parameters.push_back(pqwindowsize);
36                 CommandParameter pkeepfirst("keepfirst", "Number", "", "0", "", "", "",false,false); parameters.push_back(pkeepfirst);
37                 CommandParameter premovelast("removelast", "Number", "", "0", "", "", "",false,false); parameters.push_back(premovelast);
38                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
39                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
40                         
41                 vector<string> myArray;
42                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
43                 return myArray;
44         }
45         catch(exception& e) {
46                 m->errorOut(e, "TrimSeqsCommand", "setParameters");
47                 exit(1);
48         }
49 }
50 //**********************************************************************************************************************
51 string TrimSeqsCommand::getHelpString(){        
52         try {
53                 string helpString = "";
54                 helpString += "The trim.seqs command reads a fastaFile and creates 2 new fasta files, .trim.fasta and scrap.fasta, as well as group files if you provide and oligos file.\n";
55                 helpString += "The .trim.fasta contains sequences that meet your requirements, and the .scrap.fasta contains those which don't.\n";
56                 helpString += "The trim.seqs command parameters are fasta, flip, oligos, maxambig, maxhomop, minlength, maxlength, qfile, qthreshold, qaverage, diffs, qtrim, keepfirst, removelast and allfiles.\n";
57                 helpString += "The fasta parameter is required.\n";
58                 helpString += "The flip parameter will output the reverse compliment of your trimmed sequence. The default is false.\n";
59                 helpString += "The oligos parameter allows you to provide an oligos file.\n";
60                 helpString += "The maxambig parameter allows you to set the maximum number of ambigious bases allowed. The default is -1.\n";
61                 helpString += "The maxhomop parameter allows you to set a maximum homopolymer length. \n";
62                 helpString += "The minlength parameter allows you to set and minimum sequence length. \n";
63                 helpString += "The maxlength parameter allows you to set and maximum sequence length. \n";
64                 helpString += "The tdiffs parameter is used to specify the total number of differences allowed in the sequence. The default is pdiffs + bdiffs.\n";
65                 helpString += "The bdiffs parameter is used to specify the number of differences allowed in the barcode. The default is 0.\n";
66                 helpString += "The pdiffs parameter is used to specify the number of differences allowed in the primer. The default is 0.\n";
67                 helpString += "The qfile parameter allows you to provide a quality file.\n";
68                 helpString += "The qthreshold parameter allows you to set a minimum quality score allowed. \n";
69                 helpString += "The qaverage parameter allows you to set a minimum average quality score allowed. \n";
70                 helpString += "The qwindowsize parameter allows you to set a number of bases in a window. Default=50.\n";
71                 helpString += "The qwindowaverage parameter allows you to set a minimum average quality score allowed over a window. \n";
72                 helpString += "The rollaverage parameter allows you to set a minimum rolling average quality score allowed over a window. \n";
73                 helpString += "The qstepsize parameter allows you to set a number of bases to move the window over. Default=1.\n";
74                 helpString += "The allfiles parameter will create separate group and fasta file for each grouping. The default is F.\n";
75                 helpString += "The qtrim parameter will trim sequence from the point that they fall below the qthreshold and put it in the .trim file if set to true. The default is T.\n";
76                 helpString += "The keepfirst parameter trims the sequence to the first keepfirst number of bases after the barcode or primers are removed, before the sequence is checked to see if it meets the other requirements. \n";
77                 helpString += "The removelast removes the last removelast number of bases after the barcode or primers are removed, before the sequence is checked to see if it meets the other requirements.\n";
78                 helpString += "The trim.seqs command should be in the following format: \n";
79                 helpString += "trim.seqs(fasta=yourFastaFile, flip=yourFlip, oligos=yourOligos, maxambig=yourMaxambig,  \n";
80                 helpString += "maxhomop=yourMaxhomop, minlength=youMinlength, maxlength=yourMaxlength)  \n";    
81                 helpString += "Example trim.seqs(fasta=abrecovery.fasta, flip=..., oligos=..., maxambig=..., maxhomop=..., minlength=..., maxlength=...).\n";
82                 helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n";
83                 helpString += "For more details please check out the wiki http://www.mothur.org/wiki/Trim.seqs .\n";
84                 return helpString;
85         }
86         catch(exception& e) {
87                 m->errorOut(e, "TrimSeqsCommand", "getHelpString");
88                 exit(1);
89         }
90 }
91
92
93 //**********************************************************************************************************************
94
95 TrimSeqsCommand::TrimSeqsCommand(){     
96         try {
97                 abort = true; calledHelp = true; 
98                 setParameters();
99                 vector<string> tempOutNames;
100                 outputTypes["fasta"] = tempOutNames;
101                 outputTypes["qfile"] = tempOutNames;
102                 outputTypes["group"] = tempOutNames;
103         }
104         catch(exception& e) {
105                 m->errorOut(e, "TrimSeqsCommand", "TrimSeqsCommand");
106                 exit(1);
107         }
108 }
109 //***************************************************************************************************************
110
111 TrimSeqsCommand::TrimSeqsCommand(string option)  {
112         try {
113                 
114                 abort = false; calledHelp = false;   
115                 comboStarts = 0;
116                 
117                 //allow user to run help
118                 if(option == "help") { help(); abort = true; calledHelp = true; }
119                 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
120                 
121                 else {
122                         vector<string> myArray = setParameters();
123                         
124                         OptionParser parser(option);
125                         map<string,string> parameters = parser.getParameters();
126                         
127                         ValidParameters validParameter;
128                         map<string,string>::iterator it;
129                         
130                         //check to make sure all parameters are valid for command
131                         for (it = parameters.begin(); it != parameters.end(); it++) { 
132                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
133                         }
134                         
135                         //initialize outputTypes
136                         vector<string> tempOutNames;
137                         outputTypes["fasta"] = tempOutNames;
138                         outputTypes["qfile"] = tempOutNames;
139                         outputTypes["group"] = tempOutNames;
140                         
141                         //if the user changes the input directory command factory will send this info to us in the output parameter 
142                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
143                         if (inputDir == "not found"){   inputDir = "";          }
144                         else {
145                                 string path;
146                                 it = parameters.find("fasta");
147                                 //user has given a template file
148                                 if(it != parameters.end()){ 
149                                         path = m->hasPath(it->second);
150                                         //if the user has not given a path then, add inputdir. else leave path alone.
151                                         if (path == "") {       parameters["fasta"] = inputDir + it->second;            }
152                                 }
153                                 
154                                 it = parameters.find("oligos");
155                                 //user has given a template file
156                                 if(it != parameters.end()){ 
157                                         path = m->hasPath(it->second);
158                                         //if the user has not given a path then, add inputdir. else leave path alone.
159                                         if (path == "") {       parameters["oligos"] = inputDir + it->second;           }
160                                 }
161                                 
162                                 it = parameters.find("qfile");
163                                 //user has given a template file
164                                 if(it != parameters.end()){ 
165                                         path = m->hasPath(it->second);
166                                         //if the user has not given a path then, add inputdir. else leave path alone.
167                                         if (path == "") {       parameters["qfile"] = inputDir + it->second;            }
168                                 }
169                                 
170                         }
171
172                         
173                         //check for required parameters
174                         fastaFile = validParameter.validFile(parameters, "fasta", true);
175                         if (fastaFile == "not found") {                                 
176                                 fastaFile = m->getFastaFile(); 
177                                 if (fastaFile != "") { m->mothurOut("Using " + fastaFile + " as input file for the fasta parameter."); m->mothurOutEndLine(); }
178                                 else {  m->mothurOut("You have no current fastafile and the fasta parameter is required."); m->mothurOutEndLine(); abort = true; }
179                         }else if (fastaFile == "not open") { abort = true; }    
180                         
181                         //if the user changes the output directory command factory will send this info to us in the output parameter 
182                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  
183                                 outputDir = ""; 
184                                 outputDir += m->hasPath(fastaFile); //if user entered a file with a path then preserve it       
185                         }
186                 
187                         
188                         //check for optional parameter and set defaults
189                         // ...at some point should added some additional type checking...
190                         string temp;
191                         temp = validParameter.validFile(parameters, "flip", false);
192                         if (temp == "not found"){       flip = 0;       }
193                         else if(m->isTrue(temp))        {       flip = 1;       }
194                 
195                         temp = validParameter.validFile(parameters, "oligos", true);
196                         if (temp == "not found"){       oligoFile = "";         }
197                         else if(temp == "not open"){    abort = true;   } 
198                         else                                    {       oligoFile = temp;               }
199                         
200                         
201                         temp = validParameter.validFile(parameters, "maxambig", false);         if (temp == "not found") { temp = "-1"; }
202                         convert(temp, maxAmbig);  
203
204                         temp = validParameter.validFile(parameters, "maxhomop", false);         if (temp == "not found") { temp = "0"; }
205                         convert(temp, maxHomoP);  
206
207                         temp = validParameter.validFile(parameters, "minlength", false);        if (temp == "not found") { temp = "0"; }
208                         convert(temp, minLength); 
209                         
210                         temp = validParameter.validFile(parameters, "maxlength", false);        if (temp == "not found") { temp = "0"; }
211                         convert(temp, maxLength);
212                         
213                         temp = validParameter.validFile(parameters, "bdiffs", false);           if (temp == "not found") { temp = "0"; }
214                         convert(temp, bdiffs);
215                         
216                         temp = validParameter.validFile(parameters, "pdiffs", false);           if (temp == "not found") { temp = "0"; }
217                         convert(temp, pdiffs);
218                         
219                         temp = validParameter.validFile(parameters, "tdiffs", false);           if (temp == "not found") { int tempTotal = pdiffs + bdiffs;  temp = toString(tempTotal); }
220                         convert(temp, tdiffs);
221                         
222                         if(tdiffs == 0){        tdiffs = bdiffs + pdiffs;       }
223                         
224                         temp = validParameter.validFile(parameters, "qfile", true);     
225                         if (temp == "not found")        {       qFileName = "";         }
226                         else if(temp == "not open")     {       abort = true;           }
227                         else                                            {       qFileName = temp;       }
228                         
229                         temp = validParameter.validFile(parameters, "qthreshold", false);       if (temp == "not found") { temp = "0"; }
230                         convert(temp, qThreshold);
231                         
232                         temp = validParameter.validFile(parameters, "qtrim", false);            if (temp == "not found") { temp = "t"; }
233                         qtrim = m->isTrue(temp);
234
235                         temp = validParameter.validFile(parameters, "rollaverage", false);      if (temp == "not found") { temp = "0"; }
236                         convert(temp, qRollAverage);
237
238                         temp = validParameter.validFile(parameters, "qwindowaverage", false);if (temp == "not found") { temp = "0"; }
239                         convert(temp, qWindowAverage);
240
241                         temp = validParameter.validFile(parameters, "qwindowsize", false);      if (temp == "not found") { temp = "50"; }
242                         convert(temp, qWindowSize);
243
244                         temp = validParameter.validFile(parameters, "qstepsize", false);        if (temp == "not found") { temp = "1"; }
245                         convert(temp, qWindowStep);
246
247                         temp = validParameter.validFile(parameters, "qaverage", false);         if (temp == "not found") { temp = "0"; }
248                         convert(temp, qAverage);
249
250                         temp = validParameter.validFile(parameters, "keepfirst", false);        if (temp == "not found") { temp = "0"; }
251                         convert(temp, keepFirst);
252
253                         temp = validParameter.validFile(parameters, "removelast", false);       if (temp == "not found") { temp = "0"; }
254                         convert(temp, removeLast);
255                         
256                         temp = validParameter.validFile(parameters, "allfiles", false);         if (temp == "not found") { temp = "F"; }
257                         allFiles = m->isTrue(temp);
258                         
259                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found"){       temp = m->getProcessors();      }
260                         m->setProcessors(temp);
261                         convert(temp, processors); 
262                         
263                         
264                         if(allFiles && (oligoFile == "")){
265                                 m->mothurOut("You selected allfiles, but didn't enter an oligos.  Ignoring the allfiles request."); m->mothurOutEndLine();
266                         }
267                         if((qAverage != 0 && qThreshold != 0) && qFileName == ""){
268                                 m->mothurOut("You didn't provide a quality file name, quality criteria will be ignored."); m->mothurOutEndLine();
269                                 qAverage=0;
270                                 qThreshold=0;
271                         }
272                         if(!flip && oligoFile=="" && !maxLength && !minLength && (maxAmbig==-1) && !maxHomoP && qFileName == ""){               
273                                 m->mothurOut("You didn't set any options... quiting command."); m->mothurOutEndLine();
274                                 abort = true;
275                         }
276                 }
277
278         }
279         catch(exception& e) {
280                 m->errorOut(e, "TrimSeqsCommand", "TrimSeqsCommand");
281                 exit(1);
282         }
283 }
284 //***************************************************************************************************************
285
286 int TrimSeqsCommand::execute(){
287         try{
288         
289                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
290                 
291                 numFPrimers = 0;  //this needs to be initialized
292                 numRPrimers = 0;
293                 vector<vector<string> > fastaFileNames;
294                 vector<vector<string> > qualFileNames;
295                 
296                 string trimSeqFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "trim.fasta";
297                 outputNames.push_back(trimSeqFile); outputTypes["fasta"].push_back(trimSeqFile);
298                 
299                 string scrapSeqFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "scrap.fasta";
300                 outputNames.push_back(scrapSeqFile); outputTypes["fasta"].push_back(scrapSeqFile);
301                 
302                 string trimQualFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "trim.qual";
303                 string scrapQualFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "scrap.qual";
304                 if (qFileName != "") {
305                         outputNames.push_back(trimQualFile);
306                         outputNames.push_back(scrapQualFile);
307                         outputTypes["qfile"].push_back(trimQualFile);
308                         outputTypes["qfile"].push_back(scrapQualFile); 
309                 }
310                 
311                 string outputGroupFileName;
312                 if(oligoFile != ""){
313                         outputGroupFileName = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "groups";
314                         outputNames.push_back(outputGroupFileName); outputTypes["group"].push_back(outputGroupFileName);
315                         getOligos(fastaFileNames, qualFileNames);
316                 }
317
318                 vector<unsigned long int> fastaFilePos;
319                 vector<unsigned long int> qFilePos;
320                 
321                 setLines(fastaFile, qFileName, fastaFilePos, qFilePos);
322                 
323                 for (int i = 0; i < (fastaFilePos.size()-1); i++) {
324                         lines.push_back(new linePair(fastaFilePos[i], fastaFilePos[(i+1)]));
325                         if (qFileName != "") {  qLines.push_back(new linePair(qFilePos[i], qFilePos[(i+1)]));  }
326                 }       
327                 if(qFileName == "")     {       qLines = lines; } //files with duds
328                 
329                 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
330                                 if(processors == 1){
331                                         driverCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, outputGroupFileName, fastaFileNames, qualFileNames, lines[0], qLines[0]);
332                                 }else{
333                                         createProcessesCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, outputGroupFileName, fastaFileNames, qualFileNames); 
334                                 }       
335                 #else
336                                 driverCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, outputGroupFileName, fastaFileNames, qualFileNames, lines[0], qLines[0]);
337                 #endif
338                 
339                 if (m->control_pressed) {  return 0; }                  
340                         
341                 if(allFiles){
342                         map<string, string> uniqueFastaNames;// so we don't add the same groupfile multiple times
343                         map<string, string>::iterator it;
344                         set<string> namesToRemove;
345                         for(int i=0;i<fastaFileNames.size();i++){
346                                 for(int j=0;j<fastaFileNames[0].size();j++){
347                                         if (fastaFileNames[i][j] != "") {
348                                                 if(m->isBlank(fastaFileNames[i][j])){
349                                                         remove(fastaFileNames[i][j].c_str());
350                                                         namesToRemove.insert(fastaFileNames[i][j]);
351                                                         
352                                                         if(qFileName != ""){
353                                                                 remove(qualFileNames[i][j].c_str());
354                                                                 namesToRemove.insert(qualFileNames[i][j]);
355                                                         }
356                                                 }else{  
357                                                         it = uniqueFastaNames.find(fastaFileNames[i][j]);
358                                                         if (it == uniqueFastaNames.end()) {     
359                                                                 uniqueFastaNames[fastaFileNames[i][j]] = barcodeNameVector[i];  
360                                                         }       
361                                                 }
362                                         }
363                                 }
364                         }
365                         
366                         //remove names for outputFileNames, just cleans up the output
367                         vector<string> outputNames2;
368                         for(int i = 0; i < outputNames.size(); i++) { if (namesToRemove.count(outputNames[i]) == 0) { outputNames2.push_back(outputNames[i]); } }
369                         outputNames = outputNames2;
370                         
371                         for (it = uniqueFastaNames.begin(); it != uniqueFastaNames.end(); it++) {
372                                 ifstream in;
373                                 m->openInputFile(it->first, in);
374                                 
375                                 ofstream out;
376                                 string thisGroupName = outputDir + m->getRootName(m->getSimpleName(it->first)) + "groups";
377                                 outputNames.push_back(thisGroupName); outputTypes["group"].push_back(thisGroupName);
378                                 m->openOutputFile(thisGroupName, out);
379                                 
380                                 while (!in.eof()){
381                                         if (m->control_pressed) { break; }
382                                         
383                                         Sequence currSeq(in); m->gobble(in);
384                                         out << currSeq.getName() << '\t' << it->second << endl;
385                                 }
386                                 in.close();
387                                 out.close();
388                         }
389                 }
390                 
391                 if (m->control_pressed) {       for (int i = 0; i < outputNames.size(); i++) {  remove(outputNames[i].c_str()); } return 0;     }
392
393                 //output group counts
394                 m->mothurOutEndLine();
395                 int total = 0;
396                 for (map<string, int>::iterator it = groupCounts.begin(); it != groupCounts.end(); it++) {
397                          total += it->second; m->mothurOut("Group " + it->first + " contains " + toString(it->second) + " sequences."); m->mothurOutEndLine(); 
398                 }
399                 if (total != 0) { m->mothurOut("Total of all groups is " + toString(total)); m->mothurOutEndLine(); }
400                 
401                 if (m->control_pressed) {       for (int i = 0; i < outputNames.size(); i++) {  remove(outputNames[i].c_str()); } return 0;     }
402
403                 //set fasta file as new current fastafile
404                 string current = "";
405                 itTypes = outputTypes.find("fasta");
406                 if (itTypes != outputTypes.end()) {
407                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFastaFile(current); }
408                 }
409                 
410                 itTypes = outputTypes.find("qfile");
411                 if (itTypes != outputTypes.end()) {
412                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setQualFile(current); }
413                 }
414                 
415                 itTypes = outputTypes.find("group");
416                 if (itTypes != outputTypes.end()) {
417                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setGroupFile(current); }
418                 }
419
420                 m->mothurOutEndLine();
421                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
422                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
423                 m->mothurOutEndLine();
424                 
425                 return 0;       
426                         
427         }
428         catch(exception& e) {
429                 m->errorOut(e, "TrimSeqsCommand", "execute");
430                 exit(1);
431         }
432 }
433                 
434 /**************************************************************************************/
435
436 int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string trimFileName, string scrapFileName, string trimQFileName, string scrapQFileName, string groupFileName, vector<vector<string> > fastaFileNames, vector<vector<string> > qualFileNames, linePair* line, linePair* qline) {        
437                 
438         try {
439                 
440                 ofstream trimFASTAFile;
441                 m->openOutputFile(trimFileName, trimFASTAFile);
442                 
443                 ofstream scrapFASTAFile;
444                 m->openOutputFile(scrapFileName, scrapFASTAFile);
445                 
446                 ofstream trimQualFile;
447                 ofstream scrapQualFile;
448                 if(qFileName != ""){
449                         m->openOutputFile(trimQFileName, trimQualFile);
450                         m->openOutputFile(scrapQFileName, scrapQualFile);
451                 }
452                 
453                 ofstream outGroupsFile;
454                 if (oligoFile != ""){   m->openOutputFile(groupFileName, outGroupsFile);   }
455                 if(allFiles){
456                         for (int i = 0; i < fastaFileNames.size(); i++) { //clears old file
457                                 for (int j = 0; j < fastaFileNames[i].size(); j++) { //clears old file
458                                         if (fastaFileNames[i][j] != "") {
459                                                 ofstream temp;
460                                                 m->openOutputFile(fastaFileNames[i][j], temp);                  temp.close();
461                                                 if(qFileName != ""){
462                                                         m->openOutputFile(qualFileNames[i][j], temp);                   temp.close();
463                                                 }
464                                         }
465                                 }
466                         }
467                 }
468                 
469                 ifstream inFASTA;
470                 m->openInputFile(filename, inFASTA);
471                 inFASTA.seekg(line->start);
472                 
473                 ifstream qFile;
474                 if(qFileName != "")     {
475                         m->openInputFile(qFileName, qFile);
476                         qFile.seekg(qline->start);  
477                 }
478                 
479                 int count = 0;
480                 bool moreSeqs = 1;
481         
482                 while (moreSeqs) {
483                                 
484                         if (m->control_pressed) { 
485                                 inFASTA.close(); trimFASTAFile.close(); scrapFASTAFile.close();
486                                 if (oligoFile != "") {   outGroupsFile.close();   }
487
488                                 if(qFileName != ""){
489                                         qFile.close();
490                                 }
491                                 for (int i = 0; i < outputNames.size(); i++) {  remove(outputNames[i].c_str()); }
492
493                                 return 0;
494                         }
495                         
496                         int success = 1;
497                         string trashCode = "";
498                         int currentSeqsDiffs = 0;
499
500                         Sequence currSeq(inFASTA); m->gobble(inFASTA);
501                         
502                         QualityScores currQual;
503                         if(qFileName != ""){
504                                 currQual = QualityScores(qFile);  m->gobble(qFile);
505                         }
506
507                         string origSeq = currSeq.getUnaligned();
508                         if (origSeq != "") {
509                                 
510                                 int barcodeIndex = 0;
511                                 int primerIndex = 0;
512                                 
513                                 if(barcodes.size() != 0){
514                                         success = stripBarcode(currSeq, currQual, barcodeIndex);
515                                         if(success > bdiffs)            {       trashCode += 'b';       }
516                                         else{ currentSeqsDiffs += success;  }
517                                 }
518                                 
519                                 if(numFPrimers != 0){
520                                         success = stripForward(currSeq, currQual, primerIndex);
521                                         if(success > pdiffs)            {       trashCode += 'f';       }
522                                         else{ currentSeqsDiffs += success;  }
523                                 }
524                                 
525                                 if (currentSeqsDiffs > tdiffs)  {       trashCode += 't';   }
526                                 
527                                 if(numRPrimers != 0){
528                                         success = stripReverse(currSeq, currQual);
529                                         if(!success)                            {       trashCode += 'r';       }
530                                 }
531
532                                 if(keepFirst != 0){
533                                         success = keepFirstTrim(currSeq, currQual);
534                                 }
535                                 
536                                 if(removeLast != 0){
537                                         success = removeLastTrim(currSeq, currQual);
538                                         if(!success)                            {       trashCode += 'l';       }
539                                 }
540
541                                 
542                                 if(qFileName != ""){
543                                         int origLength = currSeq.getNumBases();
544                                         
545                                         if(qThreshold != 0)                     {       success = currQual.stripQualThreshold(currSeq, qThreshold);                     }
546                                         else if(qAverage != 0)          {       success = currQual.cullQualAverage(currSeq, qAverage);                          }
547                                         else if(qRollAverage != 0)      {       success = currQual.stripQualRollingAverage(currSeq, qRollAverage);      }
548                                         else if(qWindowAverage != 0){   success = currQual.stripQualWindowAverage(currSeq, qWindowStep, qWindowSize, qWindowAverage);   }
549                                         else                                            {       success = 1;                            }
550                                 
551                                         //you don't want to trim, if it fails above then scrap it
552                                         if ((!qtrim) && (origLength != currSeq.getNumBases())) {  success = 0; }
553                                         
554                                         if(!success)                            {       trashCode += 'q';       }
555                                 }                               
556                 
557                                 if(minLength > 0 || maxLength > 0){
558                                         success = cullLength(currSeq);
559                                         if(!success)                            {       trashCode += 'l';       }
560                                 }
561                                 if(maxHomoP > 0){
562                                         success = cullHomoP(currSeq);
563                                         if(!success)                            {       trashCode += 'h';       }
564                                 }
565                                 if(maxAmbig != -1){
566                                         success = cullAmbigs(currSeq);
567                                         if(!success)                            {       trashCode += 'n';       }
568                                 }
569                                 
570                                 if(flip){               // should go last                       
571                                         currSeq.reverseComplement();
572                                         if(qFileName != ""){
573                                                 currQual.flipQScores(); 
574                                         }
575                                 }
576                                 
577                                 if(trashCode.length() == 0){
578                                         currSeq.setAligned(currSeq.getUnaligned());
579                                         currSeq.printSequence(trimFASTAFile);
580                                         
581                                         if(qFileName != ""){
582                                                 currQual.printQScores(trimQualFile);
583                                         }
584                                         
585                                         if(barcodes.size() != 0){
586                                                 string thisGroup = barcodeNameVector[barcodeIndex];
587                                                 if (primers.size() != 0) { if (primerNameVector[primerIndex] != "") { thisGroup += "." + primerNameVector[primerIndex]; } }
588                                                 
589                                                 outGroupsFile << currSeq.getName() << '\t' << thisGroup << endl;
590                                                 
591                                                 map<string, int>::iterator it = groupCounts.find(thisGroup);
592                                                 if (it == groupCounts.end()) {  groupCounts[thisGroup] = 1; }
593                                                 else { groupCounts[it->first]++; }
594                                                         
595                                         }
596                                         
597                                         
598                                         if(allFiles){
599                                                 ofstream output;
600                                                 m->openOutputFileAppend(fastaFileNames[barcodeIndex][primerIndex], output);
601                                                 currSeq.printSequence(output);
602                                                 output.close();
603                                                 
604                                                 if(qFileName != ""){
605                                                         m->openOutputFileAppend(qualFileNames[barcodeIndex][primerIndex], output);
606                                                         currQual.printQScores(output);
607                                                         output.close();                                                 
608                                                 }
609                                         }
610                                 }
611                                 else{
612                                         currSeq.setName(currSeq.getName() + '|' + trashCode);
613                                         currSeq.setUnaligned(origSeq);
614                                         currSeq.setAligned(origSeq);
615                                         currSeq.printSequence(scrapFASTAFile);
616                                         if(qFileName != ""){
617                                                 currQual.printQScores(scrapQualFile);
618                                         }
619                                 }
620                                 count++;
621                         }
622                         
623                         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
624                                 unsigned long int pos = inFASTA.tellg();
625                                 if ((pos == -1) || (pos >= line->end)) { break; }
626                         #else
627                                 if (inFASTA.eof()) { break; }
628                         #endif
629                                 
630                         //report progress
631                         if((count) % 1000 == 0){        m->mothurOut(toString(count)); m->mothurOutEndLine();           }
632                         
633                 }
634                 //report progress
635                 if((count) % 1000 != 0){        m->mothurOut(toString(count)); m->mothurOutEndLine();           }
636
637                 
638                 inFASTA.close();
639                 trimFASTAFile.close();
640                 scrapFASTAFile.close();
641                 if (oligoFile != "") {   outGroupsFile.close();   }
642                 if(qFileName != "")     {       qFile.close();  scrapQualFile.close(); trimQualFile.close();    }
643                 
644                 return count;
645         }
646         catch(exception& e) {
647                 m->errorOut(e, "TrimSeqsCommand", "driverCreateTrim");
648                 exit(1);
649         }
650 }
651
652 /**************************************************************************************************/
653
654 int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName, string trimFASTAFileName, string scrapFASTAFileName, string trimQualFileName, string scrapQualFileName, string groupFile, vector<vector<string> > fastaFileNames, vector<vector<string> > qualFileNames) {
655         try {
656 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
657                 int process = 1;
658                 int exitCommand = 1;
659                 processIDS.clear();
660                 
661                 //loop through and create all the processes you want
662                 while (process != processors) {
663                         int pid = fork();
664                         
665                         if (pid > 0) {
666                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
667                                 process++;
668                         }else if (pid == 0){
669                                 
670                                 vector<vector<string> > tempFASTAFileNames = fastaFileNames;
671                                 vector<vector<string> > tempPrimerQualFileNames = qualFileNames;
672
673                                 if(allFiles){
674                                         ofstream temp;
675
676                                         for(int i=0;i<tempFASTAFileNames.size();i++){
677                                                 for(int j=0;j<tempFASTAFileNames[i].size();j++){
678                                                         if (tempFASTAFileNames[i][j] != "") {
679                                                                 tempFASTAFileNames[i][j] += toString(getpid()) + ".temp";
680                                                                 m->openOutputFile(tempFASTAFileNames[i][j], temp);                      temp.close();
681
682                                                                 if(qFileName != ""){
683                                                                         tempPrimerQualFileNames[i][j] += toString(getpid()) + ".temp";
684                                                                         m->openOutputFile(tempPrimerQualFileNames[i][j], temp);         temp.close();
685                                                                 }
686                                                         }
687                                                 }
688                                         }
689                                 }
690                                                         
691                                 driverCreateTrim(filename,
692                                                                  qFileName,
693                                                                  (trimFASTAFileName + toString(getpid()) + ".temp"),
694                                                                  (scrapFASTAFileName + toString(getpid()) + ".temp"),
695                                                                  (trimQualFileName + toString(getpid()) + ".temp"),
696                                                                  (scrapQualFileName + toString(getpid()) + ".temp"),
697                                                                  (groupFile + toString(getpid()) + ".temp"),
698                                                                  tempFASTAFileNames,
699                                                                  tempPrimerQualFileNames,
700                                                                  lines[process],
701                                                                  qLines[process]);
702                                 
703                                 //pass groupCounts to parent
704                                 ofstream out;
705                                 string tempFile = filename + toString(getpid()) + ".num.temp";
706                                 m->openOutputFile(tempFile, out);
707                                 for (map<string, int>::iterator it = groupCounts.begin(); it != groupCounts.end(); it++) {
708                                         out << it->first << '\t' << it->second << endl;
709                                 }
710                                 out.close();
711                                 
712                                 exit(0);
713                         }else { 
714                                 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine(); 
715                                 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
716                                 exit(0);
717                         }
718                 }
719                 
720                 //parent do my part
721                 ofstream temp;
722                 m->openOutputFile(trimFASTAFileName, temp);             temp.close();
723                 m->openOutputFile(scrapFASTAFileName, temp);    temp.close();
724                 m->openOutputFile(trimQualFileName, temp);              temp.close();
725                 m->openOutputFile(scrapQualFileName, temp);             temp.close();
726
727                 driverCreateTrim(filename, qFileName, trimFASTAFileName, scrapFASTAFileName, trimQualFileName, scrapQualFileName, groupFile, fastaFileNames, qualFileNames, lines[0], qLines[0]);
728                 
729                 //force parent to wait until all the processes are done
730                 for (int i=0;i<processIDS.size();i++) { 
731                         int temp = processIDS[i];
732                         wait(&temp);
733                 }
734                 
735                 //append files
736                 for(int i=0;i<processIDS.size();i++){
737                         
738                         m->mothurOut("Appending files from process " + toString(processIDS[i])); m->mothurOutEndLine();
739                         
740                         m->appendFiles((trimFASTAFileName + toString(processIDS[i]) + ".temp"), trimFASTAFileName);
741                         remove((trimFASTAFileName + toString(processIDS[i]) + ".temp").c_str());
742                         m->appendFiles((scrapFASTAFileName + toString(processIDS[i]) + ".temp"), scrapFASTAFileName);
743                         remove((scrapFASTAFileName + toString(processIDS[i]) + ".temp").c_str());
744                         
745                         if(qFileName != ""){
746                                 m->appendFiles((trimQualFileName + toString(processIDS[i]) + ".temp"), trimQualFileName);
747                                 remove((trimQualFileName + toString(processIDS[i]) + ".temp").c_str());
748                                 m->appendFiles((scrapQualFileName + toString(processIDS[i]) + ".temp"), scrapQualFileName);
749                                 remove((scrapQualFileName + toString(processIDS[i]) + ".temp").c_str());
750                         }
751                         
752                         m->appendFiles((groupFile + toString(processIDS[i]) + ".temp"), groupFile);
753                         remove((groupFile + toString(processIDS[i]) + ".temp").c_str());
754                         
755                         
756                         if(allFiles){
757                                 for(int j=0;j<fastaFileNames.size();j++){
758                                         for(int k=0;k<fastaFileNames[j].size();k++){
759                                                 if (fastaFileNames[j][k] != "") {
760                                                         m->appendFiles((fastaFileNames[j][k] + toString(processIDS[i]) + ".temp"), fastaFileNames[j][k]);
761                                                         remove((fastaFileNames[j][k] + toString(processIDS[i]) + ".temp").c_str());
762                                                         
763                                                         if(qFileName != ""){
764                                                                 m->appendFiles((qualFileNames[j][k] + toString(processIDS[i]) + ".temp"), qualFileNames[j][k]);
765                                                                 remove((qualFileNames[j][k] + toString(processIDS[i]) + ".temp").c_str());
766                                                         }
767                                                 }
768                                         }
769                                 }
770                         }
771                         
772                         ifstream in;
773                         string tempFile =  filename + toString(processIDS[i]) + ".num.temp";
774                         m->openInputFile(tempFile, in);
775                         int tempNum;
776                         string group;
777                         while (!in.eof()) { 
778                                 in >> group >> tempNum; m->gobble(in);
779                                 
780                                 map<string, int>::iterator it = groupCounts.find(group);
781                                 if (it == groupCounts.end()) {  groupCounts[group] = tempNum; }
782                                 else { groupCounts[it->first] += tempNum; }
783                         }
784                         in.close(); remove(tempFile.c_str());
785                         
786                 }
787         
788                 return exitCommand;
789 #endif          
790         }
791         catch(exception& e) {
792                 m->errorOut(e, "TrimSeqsCommand", "createProcessesCreateTrim");
793                 exit(1);
794         }
795 }
796
797 /**************************************************************************************************/
798
799 int TrimSeqsCommand::setLines(string filename, string qfilename, vector<unsigned long int>& fastaFilePos, vector<unsigned long int>& qfileFilePos) {
800         try {
801                 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
802                 //set file positions for fasta file
803                 fastaFilePos = m->divideFile(filename, processors);
804                 
805                 if (qfilename == "") { return processors; }
806                 
807                 //get name of first sequence in each chunk
808                 map<string, int> firstSeqNames;
809                 for (int i = 0; i < (fastaFilePos.size()-1); i++) {
810                         ifstream in;
811                         m->openInputFile(filename, in);
812                         in.seekg(fastaFilePos[i]);
813                 
814                         Sequence temp(in); 
815                         firstSeqNames[temp.getName()] = i;
816                 
817                         in.close();
818                 }
819                                 
820                 //seach for filePos of each first name in the qfile and save in qfileFilePos
821                 ifstream inQual;
822                 m->openInputFile(qfilename, inQual);
823                 
824                 string input;
825                 while(!inQual.eof()){   
826                         input = m->getline(inQual);
827
828                         if (input.length() != 0) {
829                                 if(input[0] == '>'){ //this is a sequence name line
830                                         istringstream nameStream(input);
831                                         
832                                         string sname = "";  nameStream >> sname;
833                                         sname = sname.substr(1);
834                                         
835                                         map<string, int>::iterator it = firstSeqNames.find(sname);
836                                         
837                                         if(it != firstSeqNames.end()) { //this is the start of a new chunk
838                                                 unsigned long int pos = inQual.tellg(); 
839                                                 qfileFilePos.push_back(pos - input.length() - 1);       
840                                                 firstSeqNames.erase(it);
841                                         }
842                                 }
843                         }
844                         
845                         if (firstSeqNames.size() == 0) { break; }
846                 }
847                 inQual.close();
848                 
849                 
850                 if (firstSeqNames.size() != 0) { 
851                         for (map<string, int>::iterator it = firstSeqNames.begin(); it != firstSeqNames.end(); it++) {
852                                 m->mothurOut(it->first + " is in your fasta file and not in your quality file, not using quality file."); m->mothurOutEndLine();
853                         }
854                         qFileName = "";
855                         return processors;
856                 }
857
858                 //get last file position of qfile
859                 FILE * pFile;
860                 unsigned long int size;
861                 
862                 //get num bytes in file
863                 pFile = fopen (qfilename.c_str(),"rb");
864                 if (pFile==NULL) perror ("Error opening file");
865                 else{
866                         fseek (pFile, 0, SEEK_END);
867                         size=ftell (pFile);
868                         fclose (pFile);
869                 }
870                 
871                 qfileFilePos.push_back(size);
872                 
873                 return processors;
874                 
875                 #else
876                 
877                         fastaFilePos.push_back(0); qfileFilePos.push_back(0);
878                         //get last file position of fastafile
879                         FILE * pFile;
880                         unsigned long int size;
881                         
882                         //get num bytes in file
883                         pFile = fopen (filename.c_str(),"rb");
884                         if (pFile==NULL) perror ("Error opening file");
885                         else{
886                                 fseek (pFile, 0, SEEK_END);
887                                 size=ftell (pFile);
888                                 fclose (pFile);
889                         }
890                         fastaFilePos.push_back(size);
891                         
892                         //get last file position of fastafile
893                         FILE * qFile;
894                         
895                         //get num bytes in file
896                         qFile = fopen (qfilename.c_str(),"rb");
897                         if (qFile==NULL) perror ("Error opening file");
898                         else{
899                                 fseek (qFile, 0, SEEK_END);
900                                 size=ftell (qFile);
901                                 fclose (qFile);
902                         }
903                         qfileFilePos.push_back(size);
904                 
905                         return 1;
906                 
907                 #endif
908         }
909         catch(exception& e) {
910                 m->errorOut(e, "TrimSeqsCommand", "setLines");
911                 exit(1);
912         }
913 }
914
915 //***************************************************************************************************************
916
917 void TrimSeqsCommand::getOligos(vector<vector<string> >& fastaFileNames, vector<vector<string> >& qualFileNames){
918         try {
919                 ifstream inOligos;
920                 m->openInputFile(oligoFile, inOligos);
921                 
922                 ofstream test;
923                 
924                 string type, oligo, group;
925
926                 int indexPrimer = 0;
927                 int indexBarcode = 0;
928                 
929                 while(!inOligos.eof()){
930
931                         inOligos >> type; 
932                                         
933                         if(type[0] == '#'){
934                                 while (!inOligos.eof()) {       char c = inOligos.get();  if (c == 10 || c == 13){      break;  }       } // get rest of line if there's any crap there
935                                 m->gobble(inOligos);
936                         }
937                         else{
938                                 m->gobble(inOligos);
939                                 //make type case insensitive
940                                 for(int i=0;i<type.length();i++){       type[i] = toupper(type[i]);  }
941                                 
942                                 inOligos >> oligo;
943                                 
944                                 for(int i=0;i<oligo.length();i++){
945                                         oligo[i] = toupper(oligo[i]);
946                                         if(oligo[i] == 'U')     {       oligo[i] = 'T'; }
947                                 }
948                                 
949                                 if(type == "FORWARD"){
950                                         group = "";
951                                         
952                                         // get rest of line in case there is a primer name
953                                         while (!inOligos.eof()) {       
954                                                 char c = inOligos.get(); 
955                                                 if (c == 10 || c == 13){        break;  }
956                                                 else if (c == 32 || c == 9){;} //space or tab
957                                                 else {  group += c;  }
958                                         } 
959                                         
960                                         //check for repeat barcodes
961                                         map<string, int>::iterator itPrime = primers.find(oligo);
962                                         if (itPrime != primers.end()) { m->mothurOut("primer " + oligo + " is in your oligos file already."); m->mothurOutEndLine();  }
963                                         
964                                         primers[oligo]=indexPrimer; indexPrimer++;              
965                                         primerNameVector.push_back(group);
966                                 }
967                                 else if(type == "REVERSE"){
968                                         Sequence oligoRC("reverse", oligo);
969                                         oligoRC.reverseComplement();
970                                         revPrimer.push_back(oligoRC.getUnaligned());
971                                 }
972                                 else if(type == "BARCODE"){
973                                         inOligos >> group;
974                                         
975                                         //check for repeat barcodes
976                                         map<string, int>::iterator itBar = barcodes.find(oligo);
977                                         if (itBar != barcodes.end()) { m->mothurOut("barcode " + oligo + " is in your oligos file already."); m->mothurOutEndLine();  }
978                                                 
979                                         barcodes[oligo]=indexBarcode; indexBarcode++;
980                                         barcodeNameVector.push_back(group);
981                                 }
982                                 else{   m->mothurOut(type + " is not recognized as a valid type. Choices are forward, reverse, and barcode. Ignoring " + oligo + "."); m->mothurOutEndLine();  }
983                         }
984                         m->gobble(inOligos);
985                 }       
986                 inOligos.close();
987                 
988                 if(barcodeNameVector.size() == 0 && primerNameVector[0] == ""){ allFiles = 0;   }
989                 
990                 //add in potential combos
991                 if(barcodeNameVector.size() == 0){
992                         barcodes[""] = 0;
993                         barcodeNameVector.push_back("");                        
994                 }
995                 
996                 if(primerNameVector.size() == 0){
997                         primers[""] = 0;
998                         primerNameVector.push_back("");                 
999                 }
1000                 
1001                 fastaFileNames.resize(barcodeNameVector.size());
1002                 for(int i=0;i<fastaFileNames.size();i++){
1003                         fastaFileNames[i].assign(primerNameVector.size(), "");
1004                 }
1005                 if(qFileName != ""){    qualFileNames = fastaFileNames; }
1006                 
1007                 if(allFiles){
1008                         set<string> uniqueNames; //used to cleanup outputFileNames
1009                         for(map<string, int>::iterator itBar = barcodes.begin();itBar != barcodes.end();itBar++){
1010                                 for(map<string, int>::iterator itPrimer = primers.begin();itPrimer != primers.end(); itPrimer++){
1011                                         
1012                                         string primerName = primerNameVector[itPrimer->second];
1013                                         string barcodeName = barcodeNameVector[itBar->second];
1014                                         
1015                                         string comboGroupName = "";
1016                                         string fastaFileName = "";
1017                                         string qualFileName = "";
1018                                         
1019                                         if(primerName == ""){
1020                                                 comboGroupName = barcodeNameVector[itBar->second];
1021                                         }
1022                                         else{
1023                                                 if(barcodeName == ""){
1024                                                         comboGroupName = primerNameVector[itPrimer->second];
1025                                                 }
1026                                                 else{
1027                                                         comboGroupName = barcodeNameVector[itBar->second] + "." + primerNameVector[itPrimer->second];
1028                                                 }
1029                                         }
1030
1031                                         ofstream temp;
1032                                         fastaFileName = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + comboGroupName + ".fasta";
1033                                         if (uniqueNames.count(fastaFileName) == 0) {
1034                                                 outputNames.push_back(fastaFileName);
1035                                                 outputTypes["fasta"].push_back(fastaFileName);
1036                                                 uniqueNames.insert(fastaFileName);
1037                                         }
1038                                         
1039                                         fastaFileNames[itBar->second][itPrimer->second] = fastaFileName;
1040                                         m->openOutputFile(fastaFileName, temp);         temp.close();
1041
1042                                         if(qFileName != ""){
1043                                                 qualFileName = outputDir + m->getRootName(m->getSimpleName(qFileName)) + comboGroupName + ".qual";
1044                                                 if (uniqueNames.count(fastaFileName) == 0) {
1045                                                         outputNames.push_back(qualFileName);
1046                                                         outputTypes["qfile"].push_back(qualFileName);
1047                                                 }
1048                                                 
1049                                                 qualFileNames[itBar->second][itPrimer->second] = qualFileName;
1050                                                 m->openOutputFile(qualFileName, temp);          temp.close();
1051                                         }
1052                                 }
1053                         }
1054                 }
1055                 numFPrimers = primers.size();
1056                 numRPrimers = revPrimer.size();
1057
1058         }
1059         catch(exception& e) {
1060                 m->errorOut(e, "TrimSeqsCommand", "getOligos");
1061                 exit(1);
1062         }
1063 }
1064
1065 //***************************************************************************************************************
1066
1067 int TrimSeqsCommand::stripBarcode(Sequence& seq, QualityScores& qual, int& group){
1068         try {
1069                 
1070                 string rawSequence = seq.getUnaligned();
1071                 int success = bdiffs + 1;       //guilty until proven innocent
1072                 
1073                 //can you find the barcode
1074                 for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
1075                         string oligo = it->first;
1076                         if(rawSequence.length() < oligo.length()){      //let's just assume that the barcodes are the same length
1077                                 success = bdiffs + 10;                                  //if the sequence is shorter than the barcode then bail out
1078                                 break;  
1079                         }
1080                         
1081                         if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
1082                                 group = it->second;
1083                                 seq.setUnaligned(rawSequence.substr(oligo.length()));
1084                                 
1085                                 if(qual.getName() != ""){
1086                                         qual.trimQScores(oligo.length(), -1);
1087                                 }
1088                                 
1089                                 success = 0;
1090                                 break;
1091                         }
1092                 }
1093                 
1094                 //if you found the barcode or if you don't want to allow for diffs
1095                 if ((bdiffs == 0) || (success == 0)) { return success;  }
1096                 
1097                 else { //try aligning and see if you can find it
1098
1099                         int maxLength = 0;
1100
1101                         Alignment* alignment;
1102                         if (barcodes.size() > 0) {
1103                                 map<string,int>::iterator it=barcodes.begin();
1104
1105                                 for(it;it!=barcodes.end();it++){
1106                                         if(it->first.length() > maxLength){
1107                                                 maxLength = it->first.length();
1108                                         }
1109                                 }
1110                                 alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+bdiffs+1));  
1111
1112                         }else{ alignment = NULL; } 
1113                         
1114                         //can you find the barcode
1115                         int minDiff = 1e6;
1116                         int minCount = 1;
1117                         int minGroup = -1;
1118                         int minPos = 0;
1119                         
1120                         for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
1121                                 string oligo = it->first;
1122 //                              int length = oligo.length();
1123                                 
1124                                 if(rawSequence.length() < maxLength){   //let's just assume that the barcodes are the same length
1125                                         success = bdiffs + 10;
1126                                         break;
1127                                 }
1128                                 
1129                                 //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
1130                                 alignment->align(oligo, rawSequence.substr(0,oligo.length()+bdiffs));
1131                                 oligo = alignment->getSeqAAln();
1132                                 string temp = alignment->getSeqBAln();
1133                 
1134                                 int alnLength = oligo.length();
1135                                 
1136                                 for(int i=oligo.length()-1;i>=0;i--){
1137                                         if(oligo[i] != '-'){    alnLength = i+1;        break;  }
1138                                 }
1139                                 oligo = oligo.substr(0,alnLength);
1140                                 temp = temp.substr(0,alnLength);
1141                                 
1142                                 int numDiff = countDiffs(oligo, temp);
1143                                 
1144                                 if(numDiff < minDiff){
1145                                         minDiff = numDiff;
1146                                         minCount = 1;
1147                                         minGroup = it->second;
1148                                         minPos = 0;
1149                                         for(int i=0;i<alnLength;i++){
1150                                                 if(temp[i] != '-'){
1151                                                         minPos++;
1152                                                 }
1153                                         }
1154                                 }
1155                                 else if(numDiff == minDiff){
1156                                         minCount++;
1157                                 }
1158
1159                         }
1160
1161                         if(minDiff > bdiffs)    {       success = minDiff;              }       //no good matches
1162                         else if(minCount > 1)   {       success = bdiffs + 100; }       //can't tell the difference between multiple barcodes
1163                         else{                                                                                                   //use the best match
1164                                 group = minGroup;
1165                                 seq.setUnaligned(rawSequence.substr(minPos));
1166                                 
1167                                 if(qual.getName() != ""){
1168                                         qual.trimQScores(minPos, -1);
1169                                 }
1170                                 success = minDiff;
1171                         }
1172                         
1173                         if (alignment != NULL) {  delete alignment;  }
1174                         
1175                 }
1176                 
1177                 return success;
1178                 
1179         }
1180         catch(exception& e) {
1181                 m->errorOut(e, "TrimSeqsCommand", "stripBarcode");
1182                 exit(1);
1183         }
1184
1185 }
1186
1187 //***************************************************************************************************************
1188
1189 int TrimSeqsCommand::stripForward(Sequence& seq, QualityScores& qual, int& group){
1190         try {
1191                 string rawSequence = seq.getUnaligned();
1192                 int success = pdiffs + 1;       //guilty until proven innocent
1193                 
1194                 //can you find the primer
1195                 for(map<string,int>::iterator it=primers.begin();it!=primers.end();it++){
1196                         string oligo = it->first;
1197                         if(rawSequence.length() < oligo.length()){      //let's just assume that the primers are the same length
1198                                 success = pdiffs + 10;                                  //if the sequence is shorter than the barcode then bail out
1199                                 break;  
1200                         }
1201                         
1202                         if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
1203                                 group = it->second;
1204                                 seq.setUnaligned(rawSequence.substr(oligo.length()));
1205                                 if(qual.getName() != ""){
1206                                         qual.trimQScores(oligo.length(), -1);
1207                                 }
1208                                 success = 0;
1209                                 break;
1210                         }
1211                 }
1212
1213                 //if you found the barcode or if you don't want to allow for diffs
1214                 if ((pdiffs == 0) || (success == 0)) { return success;  }
1215                 
1216                 else { //try aligning and see if you can find it
1217
1218                         int maxLength = 0;
1219
1220                         Alignment* alignment;
1221                         if (primers.size() > 0) {
1222                                 map<string,int>::iterator it=primers.begin();
1223
1224                                 for(it;it!=primers.end();it++){
1225                                         if(it->first.length() > maxLength){
1226                                                 maxLength = it->first.length();
1227                                         }
1228                                 }
1229                                 alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+pdiffs+1));  
1230
1231                         }else{ alignment = NULL; } 
1232                         
1233                         //can you find the barcode
1234                         int minDiff = 1e6;
1235                         int minCount = 1;
1236                         int minGroup = -1;
1237                         int minPos = 0;
1238                         
1239                         for(map<string,int>::iterator it=primers.begin();it!=primers.end();it++){
1240                                 string oligo = it->first;
1241 //                              int length = oligo.length();
1242                                 
1243                                 if(rawSequence.length() < maxLength){   
1244                                         success = pdiffs + 100;
1245                                         break;
1246                                 }
1247                                 
1248                                 //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
1249                                 alignment->align(oligo, rawSequence.substr(0,oligo.length()+pdiffs));
1250                                 oligo = alignment->getSeqAAln();
1251                                 string temp = alignment->getSeqBAln();
1252                 
1253                                 int alnLength = oligo.length();
1254                                 
1255                                 for(int i=oligo.length()-1;i>=0;i--){
1256                                         if(oligo[i] != '-'){    alnLength = i+1;        break;  }
1257                                 }
1258                                 oligo = oligo.substr(0,alnLength);
1259                                 temp = temp.substr(0,alnLength);
1260                                 
1261                                 int numDiff = countDiffs(oligo, temp);
1262                                 
1263                                 if(numDiff < minDiff){
1264                                         minDiff = numDiff;
1265                                         minCount = 1;
1266                                         minGroup = it->second;
1267                                         minPos = 0;
1268                                         for(int i=0;i<alnLength;i++){
1269                                                 if(temp[i] != '-'){
1270                                                         minPos++;
1271                                                 }
1272                                         }
1273                                 }
1274                                 else if(numDiff == minDiff){
1275                                         minCount++;
1276                                 }
1277
1278                         }
1279
1280                         if(minDiff > pdiffs)    {       success = minDiff;              }       //no good matches
1281                         else if(minCount > 1)   {       success = pdiffs + 10;  }       //can't tell the difference between multiple primers
1282                         else{                                                                                                   //use the best match
1283                                 group = minGroup;
1284                                 seq.setUnaligned(rawSequence.substr(minPos));
1285                                 if(qual.getName() != ""){
1286                                         qual.trimQScores(minPos, -1);
1287                                 }
1288                                 success = minDiff;
1289                         }
1290                         
1291                         if (alignment != NULL) {  delete alignment;  }
1292                         
1293                 }
1294                 
1295                 return success;
1296
1297         }
1298         catch(exception& e) {
1299                 m->errorOut(e, "TrimSeqsCommand", "stripForward");
1300                 exit(1);
1301         }
1302 }
1303
1304 //***************************************************************************************************************
1305
1306 bool TrimSeqsCommand::stripReverse(Sequence& seq, QualityScores& qual){
1307         try {
1308                 string rawSequence = seq.getUnaligned();
1309                 bool success = 0;       //guilty until proven innocent
1310                 
1311                 for(int i=0;i<numRPrimers;i++){
1312                         string oligo = revPrimer[i];
1313                         
1314                         if(rawSequence.length() < oligo.length()){
1315                                 success = 0;
1316                                 break;
1317                         }
1318                         
1319                         if(compareDNASeq(oligo, rawSequence.substr(rawSequence.length()-oligo.length(),oligo.length()))){
1320                                 seq.setUnaligned(rawSequence.substr(0,rawSequence.length()-oligo.length()));
1321                                 if(qual.getName() != ""){
1322                                         qual.trimQScores(-1, rawSequence.length()-oligo.length());
1323                                 }
1324                                 success = 1;
1325                                 break;
1326                         }
1327                 }       
1328                 return success;
1329                 
1330         }
1331         catch(exception& e) {
1332                 m->errorOut(e, "TrimSeqsCommand", "stripReverse");
1333                 exit(1);
1334         }
1335 }
1336
1337 //***************************************************************************************************************
1338
1339 bool TrimSeqsCommand::keepFirstTrim(Sequence& sequence, QualityScores& qscores){
1340         try {
1341                 bool success = 1;
1342                 if(qscores.getName() != ""){
1343                         qscores.trimQScores(-1, keepFirst);
1344                 }
1345                 sequence.trim(keepFirst);
1346                 return success;
1347         }
1348         catch(exception& e) {
1349                 m->errorOut(e, "keepFirstTrim", "countDiffs");
1350                 exit(1);
1351         }
1352         
1353 }       
1354
1355 //***************************************************************************************************************
1356
1357 bool TrimSeqsCommand::removeLastTrim(Sequence& sequence, QualityScores& qscores){
1358         try {
1359                 bool success = 0;
1360                 
1361                 int length = sequence.getNumBases() - removeLast;
1362                 
1363                 if(length > 0){
1364                         if(qscores.getName() != ""){
1365                                 qscores.trimQScores(-1, length);
1366                         }
1367                         sequence.trim(length);
1368                         success = 1;
1369                 }
1370                 else{
1371                         success = 0;
1372                 }
1373
1374                 return success;
1375         }
1376         catch(exception& e) {
1377                 m->errorOut(e, "removeLastTrim", "countDiffs");
1378                 exit(1);
1379         }
1380         
1381 }       
1382
1383 //***************************************************************************************************************
1384
1385 bool TrimSeqsCommand::cullLength(Sequence& seq){
1386         try {
1387         
1388                 int length = seq.getNumBases();
1389                 bool success = 0;       //guilty until proven innocent
1390                 
1391                 if(length >= minLength && maxLength == 0)                       {       success = 1;    }
1392                 else if(length >= minLength && length <= maxLength)     {       success = 1;    }
1393                 else                                                                                            {       success = 0;    }
1394                 
1395                 return success;
1396         
1397         }
1398         catch(exception& e) {
1399                 m->errorOut(e, "TrimSeqsCommand", "cullLength");
1400                 exit(1);
1401         }
1402         
1403 }
1404
1405 //***************************************************************************************************************
1406
1407 bool TrimSeqsCommand::cullHomoP(Sequence& seq){
1408         try {
1409                 int longHomoP = seq.getLongHomoPolymer();
1410                 bool success = 0;       //guilty until proven innocent
1411                 
1412                 if(longHomoP <= maxHomoP){      success = 1;    }
1413                 else                                    {       success = 0;    }
1414                 
1415                 return success;
1416         }
1417         catch(exception& e) {
1418                 m->errorOut(e, "TrimSeqsCommand", "cullHomoP");
1419                 exit(1);
1420         }
1421         
1422 }
1423
1424 //***************************************************************************************************************
1425
1426 bool TrimSeqsCommand::cullAmbigs(Sequence& seq){
1427         try {
1428                 int numNs = seq.getAmbigBases();
1429                 bool success = 0;       //guilty until proven innocent
1430                 
1431                 if(numNs <= maxAmbig)   {       success = 1;    }
1432                 else                                    {       success = 0;    }
1433                 
1434                 return success;
1435         }
1436         catch(exception& e) {
1437                 m->errorOut(e, "TrimSeqsCommand", "cullAmbigs");
1438                 exit(1);
1439         }
1440         
1441 }
1442
1443 //***************************************************************************************************************
1444
1445 bool TrimSeqsCommand::compareDNASeq(string oligo, string seq){
1446         try {
1447                 bool success = 1;
1448                 int length = oligo.length();
1449                 
1450                 for(int i=0;i<length;i++){
1451                         
1452                         if(oligo[i] != seq[i]){
1453                                 if(oligo[i] == 'A' || oligo[i] == 'T' || oligo[i] == 'G' || oligo[i] == 'C')    {       success = 0;    }
1454                                 else if((oligo[i] == 'N' || oligo[i] == 'I') && (seq[i] == 'N'))                                {       success = 0;    }
1455                                 else if(oligo[i] == 'R' && (seq[i] != 'A' && seq[i] != 'G'))                                    {       success = 0;    }
1456                                 else if(oligo[i] == 'Y' && (seq[i] != 'C' && seq[i] != 'T'))                                    {       success = 0;    }
1457                                 else if(oligo[i] == 'M' && (seq[i] != 'C' && seq[i] != 'A'))                                    {       success = 0;    }
1458                                 else if(oligo[i] == 'K' && (seq[i] != 'T' && seq[i] != 'G'))                                    {       success = 0;    }
1459                                 else if(oligo[i] == 'W' && (seq[i] != 'T' && seq[i] != 'A'))                                    {       success = 0;    }
1460                                 else if(oligo[i] == 'S' && (seq[i] != 'C' && seq[i] != 'G'))                                    {       success = 0;    }
1461                                 else if(oligo[i] == 'B' && (seq[i] != 'C' && seq[i] != 'T' && seq[i] != 'G'))   {       success = 0;    }
1462                                 else if(oligo[i] == 'D' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'G'))   {       success = 0;    }
1463                                 else if(oligo[i] == 'H' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'C'))   {       success = 0;    }
1464                                 else if(oligo[i] == 'V' && (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G'))   {       success = 0;    }                       
1465                                 
1466                                 if(success == 0)        {       break;   }
1467                         }
1468                         else{
1469                                 success = 1;
1470                         }
1471                 }
1472                 
1473                 return success;
1474         }
1475         catch(exception& e) {
1476                 m->errorOut(e, "TrimSeqsCommand", "compareDNASeq");
1477                 exit(1);
1478         }
1479
1480 }
1481
1482 //***************************************************************************************************************
1483
1484 int TrimSeqsCommand::countDiffs(string oligo, string seq){
1485         try {
1486
1487                 int length = oligo.length();
1488                 int countDiffs = 0;
1489                 
1490                 for(int i=0;i<length;i++){
1491                                                                 
1492                         if(oligo[i] != seq[i]){
1493                                 if(oligo[i] == 'A' || oligo[i] == 'T' || oligo[i] == 'G' || oligo[i] == 'C' || oligo[i] == '-' || oligo[i] == '.')      {       countDiffs++;   }
1494                                 else if((oligo[i] == 'N' || oligo[i] == 'I') && (seq[i] == 'N'))                                {       countDiffs++;   }
1495                                 else if(oligo[i] == 'R' && (seq[i] != 'A' && seq[i] != 'G'))                                    {       countDiffs++;   }
1496                                 else if(oligo[i] == 'Y' && (seq[i] != 'C' && seq[i] != 'T'))                                    {       countDiffs++;   }
1497                                 else if(oligo[i] == 'M' && (seq[i] != 'C' && seq[i] != 'A'))                                    {       countDiffs++;   }
1498                                 else if(oligo[i] == 'K' && (seq[i] != 'T' && seq[i] != 'G'))                                    {       countDiffs++;   }
1499                                 else if(oligo[i] == 'W' && (seq[i] != 'T' && seq[i] != 'A'))                                    {       countDiffs++;   }
1500                                 else if(oligo[i] == 'S' && (seq[i] != 'C' && seq[i] != 'G'))                                    {       countDiffs++;   }
1501                                 else if(oligo[i] == 'B' && (seq[i] != 'C' && seq[i] != 'T' && seq[i] != 'G'))   {       countDiffs++;   }
1502                                 else if(oligo[i] == 'D' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'G'))   {       countDiffs++;   }
1503                                 else if(oligo[i] == 'H' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'C'))   {       countDiffs++;   }
1504                                 else if(oligo[i] == 'V' && (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G'))   {       countDiffs++;   }       
1505                         }
1506                         
1507                 }
1508                 
1509                 return countDiffs;
1510         }
1511         catch(exception& e) {
1512                 m->errorOut(e, "TrimSeqsCommand", "countDiffs");
1513                 exit(1);
1514         }
1515
1516 }
1517
1518 //***************************************************************************************************************