]> git.donarmstrong.com Git - mothur.git/blob - trimseqscommand.cpp
added group option to trim.seqs so you can provide your own groupfile instead of...
[mothur.git] / trimseqscommand.cpp
1 /*
2  *  trimseqscommand.cpp
3  *  Mothur
4  *
5  *  Created by Pat Schloss on 6/6/09.
6  *  Copyright 2009 Patrick D. Schloss. All rights reserved.
7  *
8  */
9
10 #include "trimseqscommand.h"
11 #include "needlemanoverlap.hpp"
12
13 //**********************************************************************************************************************
14 vector<string> TrimSeqsCommand::getValidParameters(){   
15         try {
16                 string Array[] =  {"fasta", "flip", "oligos", "maxambig", "maxhomop", "group","minlength", "maxlength", "qfile", 
17                                                                         "qthreshold", "qwindowaverage", "qstepsize", "qwindowsize", "qaverage", "rollaverage", "allfiles", "qtrim","tdiffs", "pdiffs", "bdiffs", "processors", "outputdir","inputdir"};
18                 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
19                 return myArray;
20         }
21         catch(exception& e) {
22                 m->errorOut(e, "TrimSeqsCommand", "getValidParameters");
23                 exit(1);
24         }
25 }
26 //**********************************************************************************************************************
27 TrimSeqsCommand::TrimSeqsCommand(){     
28         try {
29                 abort = true;
30                 //initialize outputTypes
31                 vector<string> tempOutNames;
32                 outputTypes["fasta"] = tempOutNames;
33                 outputTypes["qual"] = tempOutNames;
34                 outputTypes["group"] = tempOutNames;
35         }
36         catch(exception& e) {
37                 m->errorOut(e, "TrimSeqsCommand", "TrimSeqsCommand");
38                 exit(1);
39         }
40 }
41 //**********************************************************************************************************************
42 vector<string> TrimSeqsCommand::getRequiredParameters(){        
43         try {
44                 string Array[] =  {"fasta"};
45                 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
46                 return myArray;
47         }
48         catch(exception& e) {
49                 m->errorOut(e, "TrimSeqsCommand", "getRequiredParameters");
50                 exit(1);
51         }
52 }
53 //**********************************************************************************************************************
54 vector<string> TrimSeqsCommand::getRequiredFiles(){     
55         try {
56                 vector<string> myArray;
57                 return myArray;
58         }
59         catch(exception& e) {
60                 m->errorOut(e, "TrimSeqsCommand", "getRequiredFiles");
61                 exit(1);
62         }
63 }
64 //***************************************************************************************************************
65
66 TrimSeqsCommand::TrimSeqsCommand(string option)  {
67         try {
68                 
69                 abort = false;
70                 comboStarts = 0;
71                 
72                 //allow user to run help
73                 if(option == "help") { help(); abort = true; }
74                 
75                 else {
76                         //valid paramters for this command
77                         string AlignArray[] =  {"fasta", "flip", "group","oligos", "maxambig", "maxhomop", "minlength", "maxlength", "qfile", 
78                                                                         "qthreshold", "qwindowaverage", "qstepsize", "qwindowsize", "qaverage", "rollaverage", "allfiles", "qtrim","tdiffs", "pdiffs", "bdiffs", "processors", "outputdir","inputdir"};
79                         
80                         vector<string> myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string)));
81                         
82                         OptionParser parser(option);
83                         map<string,string> parameters = parser.getParameters();
84                         
85                         ValidParameters validParameter;
86                         map<string,string>::iterator it;
87                         
88                         //check to make sure all parameters are valid for command
89                         for (it = parameters.begin(); it != parameters.end(); it++) { 
90                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
91                         }
92                         
93                         //initialize outputTypes
94                         vector<string> tempOutNames;
95                         outputTypes["fasta"] = tempOutNames;
96                         outputTypes["qual"] = tempOutNames;
97                         outputTypes["group"] = tempOutNames;
98                         
99                         //if the user changes the input directory command factory will send this info to us in the output parameter 
100                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
101                         if (inputDir == "not found"){   inputDir = "";          }
102                         else {
103                                 string path;
104                                 it = parameters.find("fasta");
105                                 //user has given a template file
106                                 if(it != parameters.end()){ 
107                                         path = m->hasPath(it->second);
108                                         //if the user has not given a path then, add inputdir. else leave path alone.
109                                         if (path == "") {       parameters["fasta"] = inputDir + it->second;            }
110                                 }
111                                 
112                                 it = parameters.find("oligos");
113                                 //user has given a template file
114                                 if(it != parameters.end()){ 
115                                         path = m->hasPath(it->second);
116                                         //if the user has not given a path then, add inputdir. else leave path alone.
117                                         if (path == "") {       parameters["oligos"] = inputDir + it->second;           }
118                                 }
119                                 
120                                 it = parameters.find("qfile");
121                                 //user has given a template file
122                                 if(it != parameters.end()){ 
123                                         path = m->hasPath(it->second);
124                                         //if the user has not given a path then, add inputdir. else leave path alone.
125                                         if (path == "") {       parameters["qfile"] = inputDir + it->second;            }
126                                 }
127                                 
128                                 it = parameters.find("group");
129                                 //user has given a template file
130                                 if(it != parameters.end()){ 
131                                         path = m->hasPath(it->second);
132                                         //if the user has not given a path then, add inputdir. else leave path alone.
133                                         if (path == "") {       parameters["group"] = inputDir + it->second;            }
134                                 }
135                         }
136
137                         
138                         //check for required parameters
139                         fastaFile = validParameter.validFile(parameters, "fasta", true);
140                         if (fastaFile == "not found") { m->mothurOut("fasta is a required parameter for the trim.seqs command."); m->mothurOutEndLine(); abort = true; }
141                         else if (fastaFile == "not open") { abort = true; }     
142                         
143                         //if the user changes the output directory command factory will send this info to us in the output parameter 
144                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  
145                                 outputDir = ""; 
146                                 outputDir += m->hasPath(fastaFile); //if user entered a file with a path then preserve it       
147                         }
148                 
149                         
150                         //check for optional parameter and set defaults
151                         // ...at some point should added some additional type checking...
152                         string temp;
153                         temp = validParameter.validFile(parameters, "flip", false);
154                         if (temp == "not found"){       flip = 0;       }
155                         else if(m->isTrue(temp))        {       flip = 1;       }
156                 
157                         temp = validParameter.validFile(parameters, "oligos", true);
158                         if (temp == "not found"){       oligoFile = "";         }
159                         else if(temp == "not open"){    abort = true;   } 
160                         else                                    {       oligoFile = temp;               }
161                         
162                         temp = validParameter.validFile(parameters, "group", true);
163                         if (temp == "not found"){       groupfile = "";         }
164                         else if(temp == "not open"){    abort = true;   } 
165                         else                                    {       groupfile = temp;               }
166                         
167                         temp = validParameter.validFile(parameters, "maxambig", false);         if (temp == "not found") { temp = "-1"; }
168                         convert(temp, maxAmbig);  
169
170                         temp = validParameter.validFile(parameters, "maxhomop", false);         if (temp == "not found") { temp = "0"; }
171                         convert(temp, maxHomoP);  
172
173                         temp = validParameter.validFile(parameters, "minlength", false);        if (temp == "not found") { temp = "0"; }
174                         convert(temp, minLength); 
175                         
176                         temp = validParameter.validFile(parameters, "maxlength", false);        if (temp == "not found") { temp = "0"; }
177                         convert(temp, maxLength);
178                         
179                         temp = validParameter.validFile(parameters, "bdiffs", false);           if (temp == "not found") { temp = "0"; }
180                         convert(temp, bdiffs);
181                         
182                         temp = validParameter.validFile(parameters, "pdiffs", false);           if (temp == "not found") { temp = "0"; }
183                         convert(temp, pdiffs);
184                         
185                         temp = validParameter.validFile(parameters, "tdiffs", false);           if (temp == "not found") { int tempTotal = pdiffs + bdiffs;  temp = toString(tempTotal); }
186                         convert(temp, tdiffs);
187                         
188                         if(tdiffs == 0){        tdiffs = bdiffs + pdiffs;       }
189                         
190                         temp = validParameter.validFile(parameters, "qfile", true);     
191                         if (temp == "not found")        {       qFileName = "";         }
192                         else if(temp == "not open")     {       abort = true;           }
193                         else                                            {       qFileName = temp;       }
194                         
195                         temp = validParameter.validFile(parameters, "qthreshold", false);       if (temp == "not found") { temp = "0"; }
196                         convert(temp, qThreshold);
197                         
198                         temp = validParameter.validFile(parameters, "qtrim", false);            if (temp == "not found") { temp = "F"; }
199                         qtrim = m->isTrue(temp);
200
201                         temp = validParameter.validFile(parameters, "rollaverage", false);      if (temp == "not found") { temp = "0"; }
202                         convert(temp, qRollAverage);
203
204                         temp = validParameter.validFile(parameters, "qwindowaverage", false);if (temp == "not found") { temp = "0"; }
205                         convert(temp, qWindowAverage);
206
207                         temp = validParameter.validFile(parameters, "qwindowsize", false);      if (temp == "not found") { temp = "100"; }
208                         convert(temp, qWindowSize);
209
210                         temp = validParameter.validFile(parameters, "qstepsize", false);                if (temp == "not found") { temp = "10"; }
211                         convert(temp, qWindowStep);
212
213                         temp = validParameter.validFile(parameters, "qaverage", false);         if (temp == "not found") { temp = "0"; }
214                         convert(temp, qAverage);
215                         
216                         temp = validParameter.validFile(parameters, "allfiles", false);         if (temp == "not found") { temp = "F"; }
217                         allFiles = m->isTrue(temp);
218                         
219                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found") { temp = "1"; }
220                         convert(temp, processors); 
221                         
222                         if ((oligoFile != "") && (groupfile != "")) {
223                                 m->mothurOut("You given both a oligos file and a groupfile, only one is allowed."); m->mothurOutEndLine(); abort = true;
224                         }
225                                                                                                 
226                         
227                         if(allFiles && (oligoFile == "") && (groupfile == "")){
228                                 m->mothurOut("You selected allfiles, but didn't enter an oligos or group file.  Ignoring the allfiles request."); m->mothurOutEndLine();
229                         }
230                         if((qAverage != 0 && qThreshold != 0) && qFileName == ""){
231                                 m->mothurOut("You didn't provide a quality file name, quality criteria will be ignored."); m->mothurOutEndLine();
232                                 qAverage=0;
233                                 qThreshold=0;
234                         }
235                         if(!flip && oligoFile=="" && !maxLength && !minLength && (maxAmbig==-1) && !maxHomoP && qFileName == ""){               
236                                 m->mothurOut("You didn't set any options... quiting command."); m->mothurOutEndLine();
237                                 abort = true;
238                         }
239                 }
240
241         }
242         catch(exception& e) {
243                 m->errorOut(e, "TrimSeqsCommand", "TrimSeqsCommand");
244                 exit(1);
245         }
246 }
247 //**********************************************************************************************************************
248
249 void TrimSeqsCommand::help(){
250         try {
251                 m->mothurOut("The trim.seqs command reads a fastaFile and creates .....\n");
252                 m->mothurOut("The trim.seqs command parameters are fasta, flip, oligos, group, maxambig, maxhomop, minlength, maxlength, qfile, qthreshold, qaverage, diffs, qtrim and allfiles.\n");
253                 m->mothurOut("The fasta parameter is required.\n");
254                 m->mothurOut("The group parameter allows you to enter a group file for your fasta file.\n");
255                 m->mothurOut("The flip parameter will output the reverse compliment of your trimmed sequence. The default is false.\n");
256                 m->mothurOut("The oligos parameter .... The default is "".\n");
257                 m->mothurOut("The maxambig parameter .... The default is -1.\n");
258                 m->mothurOut("The maxhomop parameter .... The default is 0.\n");
259                 m->mothurOut("The minlength parameter .... The default is 0.\n");
260                 m->mothurOut("The maxlength parameter .... The default is 0.\n");
261                 m->mothurOut("The tdiffs parameter is used to specify the total number of differences allowed in the sequence. The default is pdiffs + bdiffs.\n");
262                 m->mothurOut("The bdiffs parameter is used to specify the number of differences allowed in the barcode. The default is 0.\n");
263                 m->mothurOut("The pdiffs parameter is used to specify the number of differences allowed in the primer. The default is 0.\n");
264                 m->mothurOut("The qfile parameter .....\n");
265                 m->mothurOut("The qthreshold parameter .... The default is 0.\n");
266                 m->mothurOut("The qaverage parameter .... The default is 0.\n");
267                 m->mothurOut("The allfiles parameter .... The default is F.\n");
268                 m->mothurOut("The qtrim parameter .... The default is F.\n");
269                 m->mothurOut("The trim.seqs command should be in the following format: \n");
270                 m->mothurOut("trim.seqs(fasta=yourFastaFile, flip=yourFlip, oligos=yourOligos, maxambig=yourMaxambig,  \n");
271                 m->mothurOut("maxhomop=yourMaxhomop, minlength=youMinlength, maxlength=yourMaxlength)  \n");    
272                 m->mothurOut("Example trim.seqs(fasta=abrecovery.fasta, flip=..., oligos=..., maxambig=..., maxhomop=..., minlength=..., maxlength=...).\n");
273                 m->mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n");
274                 m->mothurOut("For more details please check out the wiki http://www.mothur.org/wiki/Trim.seqs .\n\n");
275
276         }
277         catch(exception& e) {
278                 m->errorOut(e, "TrimSeqsCommand", "help");
279                 exit(1);
280         }
281 }
282
283
284 //***************************************************************************************************************
285
286 TrimSeqsCommand::~TrimSeqsCommand(){    /*      do nothing      */      }
287
288 //***************************************************************************************************************
289
290 int TrimSeqsCommand::execute(){
291         try{
292         
293                 if (abort == true) { return 0; }
294                 
295                 numFPrimers = 0;  //this needs to be initialized
296                 numRPrimers = 0;
297                 vector<string> fastaFileNames;
298                 vector<string> qualFileNames;
299                 
300                 string trimSeqFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "trim.fasta";
301                 outputNames.push_back(trimSeqFile); outputTypes["fasta"].push_back(trimSeqFile);
302                 string scrapSeqFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "scrap.fasta";
303                 outputNames.push_back(scrapSeqFile); outputTypes["fasta"].push_back(scrapSeqFile);
304                 string trimQualFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "trim.qual";
305                 string scrapQualFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "scrap.qual";
306                 if (qFileName != "") {  outputNames.push_back(trimQualFile); outputNames.push_back(scrapQualFile);  outputTypes["qual"].push_back(trimQualFile); outputTypes["qual"].push_back(scrapQualFile); }
307                 string groupFile = "";
308                 if (groupfile == "") { groupFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "groups"; }
309                 else{
310                         groupFile = outputDir + m->getRootName(m->getSimpleName(groupfile)) + "trim.groups";
311                         outputNames.push_back(groupFile); outputTypes["group"].push_back(groupFile);
312                         groupMap = new GroupMap(groupfile);
313                         groupMap->readMap();
314                         
315                         if(allFiles){
316                                 for (int i = 0; i < groupMap->namesOfGroups.size(); i++) {
317                                         groupToIndex[groupMap->namesOfGroups[i]] = i;
318                                         groupVector.push_back(groupMap->namesOfGroups[i]);
319                                         fastaFileNames.push_back((outputDir + m->getRootName(m->getSimpleName(fastaFile)) +  groupMap->namesOfGroups[i] + ".fasta"));
320                                         
321                                         //we append later, so we want to clear file
322                                         ofstream outRemove;
323                                         m->openOutputFile(fastaFileNames[i], outRemove);
324                                         outRemove.close();
325                                         if(qFileName != ""){
326                                                 qualFileNames.push_back((outputDir + m->getRootName(m->getSimpleName(qFileName)) +  groupMap->namesOfGroups[i] + ".qual"));
327                                                 ofstream outRemove2;
328                                                 m->openOutputFile(qualFileNames[i], outRemove2);
329                                                 outRemove2.close();
330                                         }
331                                 }
332                         }
333                         comboStarts = fastaFileNames.size()-1;
334                 }
335                 
336                 if(oligoFile != ""){
337                         outputNames.push_back(groupFile); outputTypes["group"].push_back(groupFile);
338                         getOligos(fastaFileNames, qualFileNames);
339                 }
340
341                 vector<unsigned long int> fastaFilePos;
342                 vector<unsigned long int> qFilePos;
343                 
344                 setLines(fastaFile, qFileName, fastaFilePos, qFilePos);
345                 
346                 for (int i = 0; i < (fastaFilePos.size()-1); i++) {
347                         lines.push_back(new linePair(fastaFilePos[i], fastaFilePos[(i+1)]));
348                         if (qFileName != "") {  qLines.push_back(new linePair(qFilePos[i], qFilePos[(i+1)]));  }
349                 }       
350                 if(qFileName == "")     {       qLines = lines; } //files with duds
351                 
352                 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
353                                 if(processors == 1){
354                                                                                 
355                                         driverCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, groupFile, fastaFileNames, qualFileNames, lines[0], qLines[0]);
356                                         
357                                         for (int j = 0; j < fastaFileNames.size(); j++) {
358                                                 rename((fastaFileNames[j] + toString(getpid()) + ".temp").c_str(), fastaFileNames[j].c_str());
359                                         }
360                                         if(qFileName != ""){
361                                                 for (int j = 0; j < qualFileNames.size(); j++) {
362                                                         rename((qualFileNames[j] + toString(getpid()) + ".temp").c_str(), qualFileNames[j].c_str());
363                                                 }
364                                         }                                               
365
366                                 }else{
367                                         createProcessesCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, groupFile, fastaFileNames, qualFileNames); 
368                                         
369                                         rename((trimSeqFile + toString(processIDS[0]) + ".temp").c_str(), trimSeqFile.c_str());
370                                         rename((scrapSeqFile + toString(processIDS[0]) + ".temp").c_str(), scrapSeqFile.c_str());
371                                         rename((groupFile + toString(processIDS[0]) + ".temp").c_str(), groupFile.c_str());
372                                         
373                                         if(qFileName != ""){
374                                                 rename((trimQualFile + toString(processIDS[0]) + ".temp").c_str(), trimQualFile.c_str());
375                                                 rename((scrapQualFile + toString(processIDS[0]) + ".temp").c_str(), scrapQualFile.c_str());
376                                         }
377                                         
378                                         
379                                         for (int j = 0; j < fastaFileNames.size(); j++) {
380                                                 rename((fastaFileNames[j] + toString(processIDS[0]) + ".temp").c_str(), fastaFileNames[j].c_str());
381                                         }
382                                         if(qFileName != ""){
383                                                 for (int j = 0; j < qualFileNames.size(); j++) {
384                                                         rename((qualFileNames[j] + toString(getpid()) + ".temp").c_str(), qualFileNames[j].c_str());
385                                                 }
386                                         }                                               
387                                         
388                                         //append files
389                                         for(int i=1;i<processors;i++){
390                                                 m->appendFiles((trimSeqFile + toString(processIDS[i]) + ".temp"), trimSeqFile);
391                                                 remove((trimSeqFile + toString(processIDS[i]) + ".temp").c_str());
392                                                 m->appendFiles((scrapSeqFile + toString(processIDS[i]) + ".temp"), scrapSeqFile);
393                                                 remove((scrapSeqFile + toString(processIDS[i]) + ".temp").c_str());
394
395                                                 m->appendFiles((trimQualFile + toString(processIDS[i]) + ".temp"), trimQualFile);
396                                                 remove((trimQualFile + toString(processIDS[i]) + ".temp").c_str());
397                                                 m->appendFiles((scrapQualFile + toString(processIDS[i]) + ".temp"), scrapQualFile);
398                                                 remove((scrapQualFile + toString(processIDS[i]) + ".temp").c_str());
399                                                 
400                                                 m->appendFiles((groupFile + toString(processIDS[i]) + ".temp"), groupFile);
401                                                 remove((groupFile + toString(processIDS[i]) + ".temp").c_str());
402                                                 for (int j = 0; j < fastaFileNames.size(); j++) {
403                                                         m->appendFiles((fastaFileNames[j] + toString(processIDS[i]) + ".temp"), fastaFileNames[j]);
404                                                         remove((fastaFileNames[j] + toString(processIDS[i]) + ".temp").c_str());
405                                                 }
406                                                 
407                                                 if(qFileName != ""){
408                                                         for (int j = 0; j < qualFileNames.size(); j++) {
409                                                                 m->appendFiles((qualFileNames[j] + toString(processIDS[i]) + ".temp"), qualFileNames[j]);
410                                                                 remove((qualFileNames[j] + toString(processIDS[i]) + ".temp").c_str());
411                                                         }
412                                                 }                                               
413                                                 
414                                                 
415                                         }
416                                 }
417                                 
418                                 if (m->control_pressed) {  return 0; }
419                 #else
420                                 driverCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, groupFile, fastaFileNames, qualFileNames, lines[0], qLines[0]);
421                                 
422                                 for (int j = 0; j < fastaFileNames.size(); j++) {
423                                         rename((fastaFileNames[j] + toString(j) + ".temp").c_str(), fastaFileNames[j].c_str());
424                                 }
425                                 if(qFileName != ""){
426                                         for (int j = 0; j < qualFileNames.size(); j++) {
427                                                 rename((qualFileNames[j] + toString(j) + ".temp").c_str(), qualFileNames[j].c_str());
428                                         }
429                                 }
430                                                                         
431                                 if (m->control_pressed) {  return 0; }
432                 #endif
433                                         
434                                                                                 
435                 for(int i=0;i<fastaFileNames.size();i++){
436                         if (m->isBlank(fastaFileNames[i])) { remove(fastaFileNames[i].c_str()); }
437                         else if (filesToRemove.count(fastaFileNames[i]) > 0) { remove(fastaFileNames[i].c_str()); }
438                         else {
439                                 ifstream inFASTA;
440                                 string seqName;
441                                 m->openInputFile(fastaFileNames[i], inFASTA);
442                                 ofstream outGroups;
443                                 string outGroupFilename = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[i])) + "groups";
444                                 m->openOutputFile(outGroupFilename, outGroups);
445                                 outputNames.push_back(outGroupFilename); outputTypes["group"].push_back(outGroupFilename);  
446                                 
447                                 string thisGroup = "";
448                                 if (i > comboStarts) {
449                                         map<string, int>::iterator itCombo;
450                                         for(itCombo=combos.begin();itCombo!=combos.end(); itCombo++){
451                                                 if(itCombo->second == i){       thisGroup = itCombo->first;     combos.erase(itCombo);  break;  }
452                                         }
453                                 }else{ thisGroup = groupVector[i]; }
454                                 
455                                 while(!inFASTA.eof()){
456                                         if(inFASTA.get() == '>'){
457                                                 inFASTA >> seqName;
458                                                 outGroups << seqName << '\t' << thisGroup << endl;
459                                         }
460                                         while (!inFASTA.eof())  {       char c = inFASTA.get(); if (c == 10 || c == 13){        break;  }       }
461                                 }
462                                 outGroups.close();
463                                 inFASTA.close();
464                         }
465                 }
466                 
467                 if(qFileName != ""){
468                         for(int i=0;i<qualFileNames.size();i++){
469                                 if (m->isBlank(qualFileNames[i])) { remove(qualFileNames[i].c_str());   }
470                                 else if (filesToRemove.count(qualFileNames[i]) > 0) { remove(qualFileNames[i].c_str()); }
471                                 else {
472                                         ifstream inQual;
473                                         string seqName;
474                                         m->openInputFile(qualFileNames[i], inQual);
475 //                                      ofstream outGroups;
476 //                                      
477 //                                      string thisGroup = "";
478 //                                      if (i > comboStarts) {
479 //                                              map<string, int>::iterator itCombo;
480 //                                              for(itCombo=combos.begin();itCombo!=combos.end(); itCombo++){
481 //                                                      if(itCombo->second == i){       thisGroup = itCombo->first;     combos.erase(itCombo);  break;  }
482 //                                              }
483 //                                      }
484 //                                      else{ thisGroup = groupVector[i]; }
485                                         
486                                         inQual.close();
487                                 }
488                         }
489                 }
490                 
491                 
492                 if (m->control_pressed) { 
493                         for (int i = 0; i < outputNames.size(); i++) {  remove(outputNames[i].c_str()); }
494                         return 0;
495                 }
496
497                 m->mothurOutEndLine();
498                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
499                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
500                 m->mothurOutEndLine();
501                 
502                 return 0;       
503                         
504         }
505         catch(exception& e) {
506                 m->errorOut(e, "TrimSeqsCommand", "execute");
507                 exit(1);
508         }
509 }
510                 
511 /**************************************************************************************/
512
513 int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string trimFile, string scrapFile, string trimQFile, string scrapQFile, string groupFile, vector<string> fastaNames, vector<string> qualNames, linePair* line, linePair* qline) {      
514                 
515         try {
516                 
517                 ofstream outFASTA;
518                 int able = m->openOutputFile(trimFile, outFASTA);
519                 
520                 ofstream scrapFASTA;
521                 m->openOutputFile(scrapFile, scrapFASTA);
522                 
523                 ofstream outQual;
524                 ofstream scrapQual;
525                 if(qFileName != ""){
526                         m->openOutputFile(trimQFile, outQual);
527                         m->openOutputFile(scrapQFile, scrapQual);
528                 }
529                 
530                 ofstream outGroups;
531                 //vector<ofstream*> fastaFileNames;
532                 //vector<ofstream*> qualFileNames;
533                 
534                 if (oligoFile != "") {          
535                         m->openOutputFile(groupFile, outGroups);   
536                         for (int i = 0; i < fastaNames.size(); i++) {
537
538                         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
539                                 fastaNames[i] = (fastaNames[i] + toString(getpid()) + ".temp");
540                                 //fastaFileNames.push_back(new ofstream((fastaNames[i] + toString(getpid()) + ".temp").c_str(), ios::ate)); 
541                                 //clear old file if it exists
542                                 ofstream temp;
543                                 m->openOutputFile(fastaNames[i], temp);
544                                 temp.close();
545                                 if(qFileName != ""){
546                                         qualNames[i] = (qualNames[i] + toString(getpid()) + ".temp");
547                                         //qualFileNames.push_back(new ofstream((qualNames[i] + toString(getpid()) + ".temp").c_str(), ios::ate)); 
548                                         //clear old file if it exists
549                                         ofstream temp2;
550                                         m->openOutputFile(qualNames[i], temp2);
551                                         temp2.close();
552                                 }
553                         #else
554                                 //fastaFileNames.push_back(new ofstream((fastaNames[i] + toString(i) + ".temp").c_str(), ios::ate)); 
555                                 fastaNames[i] = (fastaNames[i] + toString(i) + ".temp");
556                                 ofstream temp;
557                                 m->openOutputFile(fastaNames[i], temp);
558                                 temp.close();                   
559                                 if(qFileName != ""){
560                                         //qualFileNames.push_back(new ofstream((qualNames[i] + toString(i) + ".temp").c_str(), ios::ate));      
561                                         qualNames[i] = (qualNames[i] + toString(i) + ".temp");
562                                         ofstream temp2;
563                                         m->openOutputFile(qualNames[i], temp2);
564                                         temp2.close();          
565                                 }
566                         #endif
567                         }
568                 }
569                 
570                 ifstream inFASTA;
571                 m->openInputFile(filename, inFASTA);
572                 inFASTA.seekg(line->start);
573                 
574                 ifstream qFile;
575                 if(qFileName != "")     {       m->openInputFile(qFileName, qFile);     qFile.seekg(qline->start);  }
576                 
577                 bool done = false;
578                 int count = 0;
579         
580                 while (!done) {
581                                 
582                         if (m->control_pressed) { 
583                                 inFASTA.close(); outFASTA.close(); scrapFASTA.close();
584                                 if (oligoFile != "") {   outGroups.close();   }
585                                 
586                                 //for(int i=0;i<fastaFileNames.size();i++){  fastaFileNames[i]->close(); delete fastaFileNames[i];  }   
587
588                                 if(qFileName != ""){
589                                         qFile.close();
590                                         //for(int i=0;i<qualFileNames.size();i++){  qualFileNames[i]->close(); delete qualFileNames[i];  }      
591                                 }
592                                 for (int i = 0; i < outputNames.size(); i++) {  remove(outputNames[i].c_str()); }
593
594                                 return 0;
595                         }
596                         
597                         int success = 1;
598                         
599
600                         Sequence currSeq(inFASTA); m->gobble(inFASTA);
601
602                         QualityScores currQual;
603                         if(qFileName != ""){
604                                 currQual = QualityScores(qFile, currSeq.getNumBases());  m->gobble(qFile);
605                         }
606                         
607                         string origSeq = currSeq.getUnaligned();
608                         if (origSeq != "") {
609                                 int groupBar, groupPrime;
610                                 string trashCode = "";
611                                 int currentSeqsDiffs = 0;
612
613                                 if(qFileName != ""){
614                                         if(qThreshold != 0)                     {       success = currQual.stripQualThreshold(currSeq, qThreshold);                     }
615                                         else if(qAverage != 0)          {       success = currQual.cullQualAverage(currSeq, qAverage);                          }
616                                         else if(qRollAverage != 0)      {       success = currQual.stripQualRollingAverage(currSeq, qRollAverage);      }
617                                         else if(qWindowAverage != 0){   success = currQual.stripQualWindowAverage(currSeq, qWindowStep, qWindowSize, qWindowAverage);   }
618
619                                         if (qtrim == 1 && (origSeq.length() != currSeq.getUnaligned().length())) { 
620                                                 success = 0; //if you don't want to trim and the sequence does not meet quality requirements, move to scrap
621                                         }
622
623                                         if(!success)                            {       trashCode += 'q';       }
624                                 }
625                         
626                                 if(barcodes.size() != 0){
627                                         success = stripBarcode(currSeq, currQual, groupBar);
628                                         if(success > bdiffs)            {       trashCode += 'b';       }
629                                         else{ currentSeqsDiffs += success;  }
630                                 }
631
632                                 if(numFPrimers != 0){
633                                         success = stripForward(currSeq, currQual, groupPrime);
634                                         if(success > pdiffs)            {       trashCode += 'f';       }
635                                         else{ currentSeqsDiffs += success;  }
636                                 }
637                                 
638                                 if (currentSeqsDiffs > tdiffs)  {       trashCode += 't';   }
639
640                                 if(numRPrimers != 0){
641                                         success = stripReverse(currSeq, currQual);
642                                         if(!success)                            {       trashCode += 'r';       }
643                                 }
644                 
645                                 if(minLength > 0 || maxLength > 0){
646                                         success = cullLength(currSeq);
647                                         if(!success)                            {       trashCode += 'l';       }
648                                 }
649                                 if(maxHomoP > 0){
650                                         success = cullHomoP(currSeq);
651                                         if(!success)                            {       trashCode += 'h';       }
652                                 }
653                                 if(maxAmbig != -1){
654                                         success = cullAmbigs(currSeq);
655                                         if(!success)                            {       trashCode += 'n';       }
656                                 }
657                                 
658                                 if(flip){       currSeq.reverseComplement();            currQual.flipQScores(); }               // should go last                       
659                                 
660                                 if(trashCode.length() == 0){
661                                         currSeq.setAligned(currSeq.getUnaligned());
662                                         currSeq.printSequence(outFASTA);
663                                         currQual.printQScores(outQual);
664                                         
665                                         if(barcodes.size() != 0){
666                                                 string thisGroup = groupVector[groupBar];
667                                                 int indexToFastaFile = groupBar;
668                                                 if (primers.size() != 0){
669                                                         //does this primer have a group
670                                                         if (groupVector[groupPrime] != "") {  
671                                                                 thisGroup += "." + groupVector[groupPrime]; 
672                                                                 indexToFastaFile = combos[thisGroup];
673                                                         }
674                                                 }
675                                                 outGroups << currSeq.getName() << '\t' << thisGroup << endl;
676                                                 if(allFiles){
677                                                         ofstream outTemp;
678                                                         m->openOutputFileAppend(fastaNames[indexToFastaFile], outTemp);
679                                                         //currSeq.printSequence(*fastaFileNames[indexToFastaFile]);
680                                                         currSeq.printSequence(outTemp);
681                                                         outTemp.close();
682                                                         
683                                                         if(qFileName != ""){
684                                                                 //currQual.printQScores(*qualFileNames[indexToFastaFile]);
685                                                                 ofstream outTemp2;
686                                                                 m->openOutputFileAppend(qualNames[indexToFastaFile], outTemp2);
687                                                                 currQual.printQScores(outTemp2);
688                                                                 outTemp2.close();                                                       
689                                                         }
690                                                 }
691                                         }
692                                         
693                                         if (groupfile != "") {
694                                                 string thisGroup = groupMap->getGroup(currSeq.getName());
695                                                 
696                                                 if (thisGroup != "not found") {
697                                                         outGroups << currSeq.getName() << '\t' << thisGroup << endl;
698                                                         if (allFiles) {
699                                                                 ofstream outTemp;
700                                                                 m->openOutputFileAppend(fastaNames[groupToIndex[thisGroup]], outTemp);
701                                                                 currSeq.printSequence(outTemp);
702                                                                 outTemp.close();
703                                                                 if(qFileName != ""){
704                                                                         ofstream outTemp2;
705                                                                         m->openOutputFileAppend(qualNames[groupToIndex[thisGroup]], outTemp2);
706                                                                         currQual.printQScores(outTemp2);
707                                                                         outTemp2.close();                                                       
708                                                                 }
709                                                         }
710                                                 }else{
711                                                         m->mothurOut(currSeq.getName() + " is not in your groupfile, adding to group XXX."); m->mothurOutEndLine();
712                                                         outGroups << currSeq.getName() << '\t' << "XXX" << endl;
713                                                         if (allFiles) {  
714                                                                 m->mothurOut("[ERROR]: " + currSeq.getName() + " will not be added to any .group.fasta or .group.qual file."); m->mothurOutEndLine();
715                                                         }
716                                                 }
717                                         }
718                                 }
719                                 else{
720                                         currSeq.setName(currSeq.getName() + '|' + trashCode);
721                                         currSeq.setUnaligned(origSeq);
722                                         currSeq.setAligned(origSeq);
723                                         currSeq.printSequence(scrapFASTA);
724                                         currQual.printQScores(scrapQual);
725                                 }
726                                 count++;
727                         }
728                         
729                         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
730                                 unsigned long int pos = inFASTA.tellg();
731                                 if ((pos == -1) || (pos >= line->end)) { break; }
732                         #else
733                                 if (inFASTA.eof()) { break; }
734                         #endif
735                                 
736                         //report progress
737                         if((count) % 1000 == 0){        m->mothurOut(toString(count)); m->mothurOutEndLine();           }
738                         
739                 }
740                 //report progress
741                 if((count) % 1000 != 0){        m->mothurOut(toString(count)); m->mothurOutEndLine();           }
742
743                 
744                 inFASTA.close();
745                 outFASTA.close();
746                 scrapFASTA.close();
747                 if (oligoFile != "") {   outGroups.close();   }
748                 if(qFileName != "")     {       qFile.close();  scrapQual.close(); outQual.close();     }
749                 
750                 //for(int i=0;i<fastaFileNames.size();i++){
751                 //      fastaFileNames[i]->close();
752                 //      delete fastaFileNames[i];
753                 //}             
754                 
755                 //if(qFileName != ""){
756                         //for(int i=0;i<qualFileNames.size();i++){
757                                 //qualFileNames[i]->close();
758                                 //delete qualFileNames[i];
759                         //}             
760                 //}                     
761                 
762                 return count;
763         }
764         catch(exception& e) {
765                 m->errorOut(e, "TrimSeqsCommand", "driverCreateTrim");
766                 exit(1);
767         }
768 }
769
770 /**************************************************************************************************/
771
772 int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName, string trimFile, string scrapFile, string trimQFile, string scrapQFile, string groupFile, vector<string> fastaNames, vector<string> qualNames) {
773         try {
774 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
775                 int process = 0;
776                 int exitCommand = 1;
777                 processIDS.clear();
778                 
779                 //loop through and create all the processes you want
780                 while (process != processors) {
781                         int pid = fork();
782                         
783                         if (pid > 0) {
784                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
785                                 process++;
786                         }else if (pid == 0){
787                                 driverCreateTrim(filename, qFileName, (trimFile + toString(getpid()) + ".temp"), (scrapFile + toString(getpid()) + ".temp"), (trimQFile + toString(getpid()) + ".temp"), (scrapQFile + toString(getpid()) + ".temp"), (groupFile + toString(getpid()) + ".temp"), fastaNames, qualNames, lines[process], qLines[process]);
788                                 exit(0);
789                         }else { 
790                                 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine(); 
791                                 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
792                                 exit(0);
793                         }
794                 }
795                 
796                 //force parent to wait until all the processes are done
797                 for (int i=0;i<processors;i++) { 
798                         int temp = processIDS[i];
799                         wait(&temp);
800                 }
801                 
802                 return exitCommand;
803 #endif          
804         }
805         catch(exception& e) {
806                 m->errorOut(e, "TrimSeqsCommand", "createProcessesCreateTrim");
807                 exit(1);
808         }
809 }
810
811 /**************************************************************************************************/
812
813 int TrimSeqsCommand::setLines(string filename, string qfilename, vector<unsigned long int>& fastaFilePos, vector<unsigned long int>& qfileFilePos) {
814         try {
815                 
816                 //set file positions for fasta file
817                 fastaFilePos = m->divideFile(filename, processors);
818                 
819                 if (qfilename == "") { return processors; }
820                 
821                 //get name of first sequence in each chunk
822                 map<string, int> firstSeqNames;
823                 for (int i = 0; i < (fastaFilePos.size()-1); i++) {
824                         ifstream in;
825                         m->openInputFile(filename, in);
826                         in.seekg(fastaFilePos[i]);
827                 
828                         Sequence temp(in); 
829                         firstSeqNames[temp.getName()] = i;
830                 
831                         in.close();
832                 }
833                                 
834                 //seach for filePos of each first name in the qfile and save in qfileFilePos
835                 ifstream inQual;
836                 m->openInputFile(qfilename, inQual);
837                 
838                 string input;
839                 while(!inQual.eof()){   
840                         input = m->getline(inQual);
841
842                         if (input.length() != 0) {
843                                 if(input[0] == '>'){ //this is a sequence name line
844                                         istringstream nameStream(input);
845                                         
846                                         string sname = "";  nameStream >> sname;
847                                         sname = sname.substr(1);
848                                         
849                                         map<string, int>::iterator it = firstSeqNames.find(sname);
850                                         
851                                         if(it != firstSeqNames.end()) { //this is the start of a new chunk
852                                                 unsigned long int pos = inQual.tellg(); 
853                                                 qfileFilePos.push_back(pos - input.length() - 1);       
854                                                 firstSeqNames.erase(it);
855                                         }
856                                 }
857                         }
858                         
859                         if (firstSeqNames.size() == 0) { break; }
860                 }
861                 inQual.close();
862                 
863                 
864                 if (firstSeqNames.size() != 0) { 
865                         for (map<string, int>::iterator it = firstSeqNames.begin(); it != firstSeqNames.end(); it++) {
866                                 m->mothurOut(it->first + " is in your fasta file and not in your quality file, not using quality file."); m->mothurOutEndLine();
867                         }
868                         qFileName = "";
869                         return processors;
870                 }
871
872                 //get last file position of qfile
873                 FILE * pFile;
874                 unsigned long int size;
875                 
876                 //get num bytes in file
877                 pFile = fopen (qfilename.c_str(),"rb");
878                 if (pFile==NULL) perror ("Error opening file");
879                 else{
880                         fseek (pFile, 0, SEEK_END);
881                         size=ftell (pFile);
882                         fclose (pFile);
883                 }
884                 
885                 qfileFilePos.push_back(size);
886                 
887                 return processors;
888         }
889         catch(exception& e) {
890                 m->errorOut(e, "TrimSeqsCommand", "setLines");
891                 exit(1);
892         }
893 }
894 //***************************************************************************************************************
895
896 void TrimSeqsCommand::getOligos(vector<string>& outFASTAVec, vector<string>& outQualVec){
897         try {
898                 ifstream inOligos;
899                 m->openInputFile(oligoFile, inOligos);
900                 
901                 ofstream test;
902                 
903                 string type, oligo, group;
904                 int index=0;
905                 //int indexPrimer = 0;
906                 
907                 while(!inOligos.eof()){
908                         inOligos >> type;
909                                         
910                         if(type[0] == '#'){
911                                 while (!inOligos.eof()) {       char c = inOligos.get(); if (c == 10 || c == 13){       break;  }       } // get rest of line if there's any crap there
912                         }
913                         else{
914                                 //make type case insensitive
915                                 for(int i=0;i<type.length();i++){       type[i] = toupper(type[i]);  }
916                                 
917                                 inOligos >> oligo;
918                                 
919                                 for(int i=0;i<oligo.length();i++){
920                                         oligo[i] = toupper(oligo[i]);
921                                         if(oligo[i] == 'U')     {       oligo[i] = 'T'; }
922                                 }
923                                 
924                                 if(type == "FORWARD"){
925                                         group = "";
926                                         
927                                         // get rest of line in case there is a primer name
928                                         while (!inOligos.eof()) {       
929                                                 char c = inOligos.get(); 
930                                                 if (c == 10 || c == 13){        break;  }
931                                                 else if (c == 32 || c == 9){;} //space or tab
932                                                 else {  group += c;  }
933                                         } 
934                                         
935                                         //check for repeat barcodes
936                                         map<string, int>::iterator itPrime = primers.find(oligo);
937                                         if (itPrime != primers.end()) { m->mothurOut("primer " + oligo + " is in your oligos file already."); m->mothurOutEndLine();  }
938                                         
939                                         primers[oligo]=index; index++;
940                                         groupVector.push_back(group);
941                                         
942                                         if(allFiles){
943                                                 outFASTAVec.push_back((outputDir + m->getRootName(m->getSimpleName(fastaFile)) + group + ".fasta"));
944                                                 if(qFileName != ""){
945                                                         outQualVec.push_back((outputDir + m->getRootName(m->getSimpleName(qFileName)) + group + ".qual"));
946                                                 }
947                                                 if (group == "") { //if there is not a group for this primer, then this file will not get written to, but we add it to keep the indexes correct
948                                                         filesToRemove.insert((outputDir + m->getRootName(m->getSimpleName(fastaFile)) + group + ".fasta"));
949                                                         if(qFileName != ""){
950                                                                 filesToRemove.insert((outputDir + m->getRootName(m->getSimpleName(qFileName)) + group + ".qual"));
951                                                         }
952                                                 }else {
953                                                         outputNames.push_back((outputDir + m->getRootName(m->getSimpleName(fastaFile)) + group + ".fasta"));
954                                                         outputTypes["fasta"].push_back((outputDir + m->getRootName(m->getSimpleName(fastaFile)) + group + ".fasta"));
955                                                         if(qFileName != ""){
956                                                                 outputNames.push_back((outputDir + m->getRootName(m->getSimpleName(qFileName)) + group + ".qual"));
957                                                                 outputTypes["qual"].push_back((outputDir + m->getRootName(m->getSimpleName(qFileName)) + group + ".qual"));
958                                                         }                                                       
959                                                 }
960                                         }
961
962                                 }
963                                 else if(type == "REVERSE"){
964                                         Sequence oligoRC("reverse", oligo);
965                                         oligoRC.reverseComplement();
966                                         revPrimer.push_back(oligoRC.getUnaligned());
967                                 }
968                                 else if(type == "BARCODE"){
969                                         inOligos >> group;
970                                         
971                                         //check for repeat barcodes
972                                         map<string, int>::iterator itBar = barcodes.find(oligo);
973                                         if (itBar != barcodes.end()) { m->mothurOut("barcode " + oligo + " is in your oligos file already."); m->mothurOutEndLine();  }
974                                         
975                                         barcodes[oligo]=index; index++;
976                                         groupVector.push_back(group);
977                                         
978                                         if(allFiles){
979                                                 outputNames.push_back((outputDir + m->getRootName(m->getSimpleName(fastaFile)) + group + ".fasta"));
980                                                 outputNames.push_back((outputDir + m->getRootName(m->getSimpleName(fastaFile)) + group + ".fasta"));
981                                                 outFASTAVec.push_back((outputDir + m->getRootName(m->getSimpleName(fastaFile)) + group + ".fasta"));
982                                                 if(qFileName != ""){
983                                                         outQualVec.push_back((outputDir + m->getRootName(m->getSimpleName(qFileName)) + group + ".qual"));
984                                                         outputNames.push_back((outputDir + m->getRootName(m->getSimpleName(qFileName)) + group + ".qual"));
985                                                         outputTypes["qual"].push_back((outputDir + m->getRootName(m->getSimpleName(qFileName)) + group + ".qual"));
986                                                 }                                                       
987                                         }
988                                 }else{  m->mothurOut(type + " is not recognized as a valid type. Choices are forward, reverse, and barcode. Ignoring " + oligo + "."); m->mothurOutEndLine();  }
989                         }
990                         m->gobble(inOligos);
991                 }
992                 
993                 inOligos.close();
994                 
995                 //add in potential combos
996                 if(allFiles){
997                         comboStarts = outFASTAVec.size()-1;
998                         for (map<string, int>::iterator itBar = barcodes.begin(); itBar != barcodes.end(); itBar++) {
999                                 for (map<string, int>::iterator itPrime = primers.begin(); itPrime != primers.end(); itPrime++) {
1000                                         if (groupVector[itPrime->second] != "") { //there is a group for this primer
1001                                                 outputNames.push_back((outputDir + m->getRootName(m->getSimpleName(qFileName)) + groupVector[itBar->second] + "." + groupVector[itPrime->second] + ".fasta"));
1002                                                 outputTypes["fasta"].push_back((outputDir + m->getRootName(m->getSimpleName(qFileName)) + groupVector[itBar->second] + "." + groupVector[itPrime->second] + ".fasta"));
1003                                                 outFASTAVec.push_back((outputDir + m->getRootName(m->getSimpleName(fastaFile)) + groupVector[itBar->second] + "." + groupVector[itPrime->second] + ".fasta"));
1004                                                 combos[(groupVector[itBar->second] + "." + groupVector[itPrime->second])] = outFASTAVec.size()-1;
1005                                                 
1006                                                 if(qFileName != ""){
1007                                                         outQualVec.push_back((outputDir + m->getRootName(m->getSimpleName(qFileName)) + groupVector[itBar->second] + "." + groupVector[itPrime->second] + ".qual"));
1008                                                         outputNames.push_back((outputDir + m->getRootName(m->getSimpleName(qFileName)) + groupVector[itBar->second] + "." + groupVector[itPrime->second] + ".qual"));
1009                                                         outputTypes["qual"].push_back((outputDir + m->getRootName(m->getSimpleName(qFileName)) + groupVector[itBar->second] + "." + groupVector[itPrime->second] + ".qual"));
1010                                                 }
1011                                         }
1012                                 }
1013                         }
1014                 }
1015                 
1016                 numFPrimers = primers.size();
1017                 numRPrimers = revPrimer.size();
1018                 
1019         }
1020         catch(exception& e) {
1021                 m->errorOut(e, "TrimSeqsCommand", "getOligos");
1022                 exit(1);
1023         }
1024 }
1025 //***************************************************************************************************************
1026
1027 int TrimSeqsCommand::stripBarcode(Sequence& seq, QualityScores& qual, int& group){
1028         try {
1029                 
1030                 string rawSequence = seq.getUnaligned();
1031                 int success = bdiffs + 1;       //guilty until proven innocent
1032                 
1033                 //can you find the barcode
1034                 for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
1035                         string oligo = it->first;
1036                         if(rawSequence.length() < oligo.length()){      //let's just assume that the barcodes are the same length
1037                                 success = bdiffs + 10;                                  //if the sequence is shorter than the barcode then bail out
1038                                 break;  
1039                         }
1040                         
1041                         if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
1042                                 group = it->second;
1043                                 seq.setUnaligned(rawSequence.substr(oligo.length()));
1044                                 
1045                                 if(qual.getName() != ""){
1046                                         qual.trimQScores(oligo.length(), -1);
1047                                 }
1048                                 
1049                                 success = 0;
1050                                 break;
1051                         }
1052                 }
1053                 
1054                 //if you found the barcode or if you don't want to allow for diffs
1055 //              cout << success;
1056                 if ((bdiffs == 0) || (success == 0)) { return success;  }
1057                 
1058                 else { //try aligning and see if you can find it
1059 //                      cout << endl;
1060
1061                         int maxLength = 0;
1062
1063                         Alignment* alignment;
1064                         if (barcodes.size() > 0) {
1065                                 map<string,int>::iterator it=barcodes.begin();
1066
1067                                 for(it;it!=barcodes.end();it++){
1068                                         if(it->first.length() > maxLength){
1069                                                 maxLength = it->first.length();
1070                                         }
1071                                 }
1072                                 alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+bdiffs+1));  
1073
1074                         }else{ alignment = NULL; } 
1075                         
1076                         //can you find the barcode
1077                         int minDiff = 1e6;
1078                         int minCount = 1;
1079                         int minGroup = -1;
1080                         int minPos = 0;
1081                         
1082                         for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
1083                                 string oligo = it->first;
1084 //                              int length = oligo.length();
1085                                 
1086                                 if(rawSequence.length() < maxLength){   //let's just assume that the barcodes are the same length
1087                                         success = bdiffs + 10;
1088                                         break;
1089                                 }
1090                                 
1091                                 //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
1092                                 alignment->align(oligo, rawSequence.substr(0,oligo.length()+bdiffs));
1093                                 oligo = alignment->getSeqAAln();
1094                                 string temp = alignment->getSeqBAln();
1095                 
1096                                 int alnLength = oligo.length();
1097                                 
1098                                 for(int i=oligo.length()-1;i>=0;i--){
1099                                         if(oligo[i] != '-'){    alnLength = i+1;        break;  }
1100                                 }
1101                                 oligo = oligo.substr(0,alnLength);
1102                                 temp = temp.substr(0,alnLength);
1103                                 
1104                                 int newStart=0;
1105                                 int numDiff = countDiffs(oligo, temp);
1106                                 
1107 //                              cout << oligo << '\t' << temp << '\t' << numDiff << endl;                               
1108                                 
1109                                 if(numDiff < minDiff){
1110                                         minDiff = numDiff;
1111                                         minCount = 1;
1112                                         minGroup = it->second;
1113                                         minPos = 0;
1114                                         for(int i=0;i<alnLength;i++){
1115                                                 if(temp[i] != '-'){
1116                                                         minPos++;
1117                                                 }
1118                                         }
1119                                 }
1120                                 else if(numDiff == minDiff){
1121                                         minCount++;
1122                                 }
1123
1124                         }
1125
1126                         if(minDiff > bdiffs)    {       success = minDiff;              }       //no good matches
1127                         else if(minCount > 1)   {       success = bdiffs + 100; }       //can't tell the difference between multiple barcodes
1128                         else{                                                                                                   //use the best match
1129                                 group = minGroup;
1130                                 seq.setUnaligned(rawSequence.substr(minPos));
1131                                 
1132                                 if(qual.getName() != ""){
1133                                         qual.trimQScores(minPos, -1);
1134                                 }
1135                                 success = minDiff;
1136                         }
1137                         
1138                         if (alignment != NULL) {  delete alignment;  }
1139                         
1140                 }
1141 //              cout << success << endl;
1142                 
1143                 return success;
1144                 
1145         }
1146         catch(exception& e) {
1147                 m->errorOut(e, "TrimSeqsCommand", "stripBarcode");
1148                 exit(1);
1149         }
1150
1151 }
1152
1153 //***************************************************************************************************************
1154
1155 int TrimSeqsCommand::stripForward(Sequence& seq, QualityScores& qual, int& group){
1156         try {
1157                 string rawSequence = seq.getUnaligned();
1158                 int success = pdiffs + 1;       //guilty until proven innocent
1159                 
1160                 //can you find the primer
1161                 for(map<string,int>::iterator it=primers.begin();it!=primers.end();it++){
1162                         string oligo = it->first;
1163                         if(rawSequence.length() < oligo.length()){      //let's just assume that the primers are the same length
1164                                 success = pdiffs + 10;                                  //if the sequence is shorter than the barcode then bail out
1165                                 break;  
1166                         }
1167                         
1168                         if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
1169                                 group = it->second;
1170                                 seq.setUnaligned(rawSequence.substr(oligo.length()));
1171                                 if(qual.getName() != ""){
1172                                         qual.trimQScores(oligo.length(), -1);
1173                                         
1174                                 }
1175                                 success = 0;
1176                                 break;
1177                         }
1178                 }
1179
1180                 //if you found the barcode or if you don't want to allow for diffs
1181 //              cout << success;
1182                 if ((pdiffs == 0) || (success == 0)) { return success;  }
1183                 
1184                 else { //try aligning and see if you can find it
1185 //                      cout << endl;
1186
1187                         int maxLength = 0;
1188
1189                         Alignment* alignment;
1190                         if (primers.size() > 0) {
1191                                 map<string,int>::iterator it=primers.begin();
1192
1193                                 for(it;it!=primers.end();it++){
1194                                         if(it->first.length() > maxLength){
1195                                                 maxLength = it->first.length();
1196                                         }
1197                                 }
1198                                 alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+pdiffs+1));  
1199
1200                         }else{ alignment = NULL; } 
1201                         
1202                         //can you find the barcode
1203                         int minDiff = 1e6;
1204                         int minCount = 1;
1205                         int minGroup = -1;
1206                         int minPos = 0;
1207                         
1208                         for(map<string,int>::iterator it=primers.begin();it!=primers.end();it++){
1209                                 string oligo = it->first;
1210 //                              int length = oligo.length();
1211                                 
1212                                 if(rawSequence.length() < maxLength){   
1213                                         success = pdiffs + 100;
1214                                         break;
1215                                 }
1216                                 
1217                                 //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
1218                                 alignment->align(oligo, rawSequence.substr(0,oligo.length()+pdiffs));
1219                                 oligo = alignment->getSeqAAln();
1220                                 string temp = alignment->getSeqBAln();
1221                 
1222                                 int alnLength = oligo.length();
1223                                 
1224                                 for(int i=oligo.length()-1;i>=0;i--){
1225                                         if(oligo[i] != '-'){    alnLength = i+1;        break;  }
1226                                 }
1227                                 oligo = oligo.substr(0,alnLength);
1228                                 temp = temp.substr(0,alnLength);
1229                                 
1230                                 int newStart=0;
1231                                 int numDiff = countDiffs(oligo, temp);
1232                                 
1233 //                              cout << oligo << '\t' << temp << '\t' << numDiff << endl;                               
1234                                 
1235                                 if(numDiff < minDiff){
1236                                         minDiff = numDiff;
1237                                         minCount = 1;
1238                                         minGroup = it->second;
1239                                         minPos = 0;
1240                                         for(int i=0;i<alnLength;i++){
1241                                                 if(temp[i] != '-'){
1242                                                         minPos++;
1243                                                 }
1244                                         }
1245                                 }
1246                                 else if(numDiff == minDiff){
1247                                         minCount++;
1248                                 }
1249
1250                         }
1251
1252                         if(minDiff > pdiffs)    {       success = minDiff;              }       //no good matches
1253                         else if(minCount > 1)   {       success = pdiffs + 10;  }       //can't tell the difference between multiple primers
1254                         else{                                                                                                   //use the best match
1255                                 group = minGroup;
1256                                 seq.setUnaligned(rawSequence.substr(minPos));
1257                                 if(qual.getName() != ""){
1258                                         qual.trimQScores(minPos, -1);
1259                                 }
1260                                 success = minDiff;
1261                         }
1262                         
1263                         if (alignment != NULL) {  delete alignment;  }
1264                         
1265                 }
1266                 
1267                 return success;
1268
1269         }
1270         catch(exception& e) {
1271                 m->errorOut(e, "TrimSeqsCommand", "stripForward");
1272                 exit(1);
1273         }
1274 }
1275
1276 //***************************************************************************************************************
1277
1278 bool TrimSeqsCommand::stripReverse(Sequence& seq, QualityScores& qual){
1279         try {
1280                 string rawSequence = seq.getUnaligned();
1281                 bool success = 0;       //guilty until proven innocent
1282                 
1283                 for(int i=0;i<numRPrimers;i++){
1284                         string oligo = revPrimer[i];
1285                         
1286                         if(rawSequence.length() < oligo.length()){
1287                                 success = 0;
1288                                 break;
1289                         }
1290                         
1291                         if(compareDNASeq(oligo, rawSequence.substr(rawSequence.length()-oligo.length(),oligo.length()))){
1292                                 seq.setUnaligned(rawSequence.substr(0,rawSequence.length()-oligo.length()));
1293                                 if(qual.getName() != ""){
1294                                         qual.trimQScores(-1, rawSequence.length()-oligo.length());
1295                                 }
1296                                 success = 1;
1297                                 break;
1298                         }
1299                 }       
1300                 return success;
1301                 
1302         }
1303         catch(exception& e) {
1304                 m->errorOut(e, "TrimSeqsCommand", "stripReverse");
1305                 exit(1);
1306         }
1307 }
1308
1309 //***************************************************************************************************************
1310
1311 bool TrimSeqsCommand::cullLength(Sequence& seq){
1312         try {
1313         
1314                 int length = seq.getNumBases();
1315                 bool success = 0;       //guilty until proven innocent
1316                 
1317                 if(length >= minLength && maxLength == 0)                       {       success = 1;    }
1318                 else if(length >= minLength && length <= maxLength)     {       success = 1;    }
1319                 else                                                                                            {       success = 0;    }
1320                 
1321                 return success;
1322         
1323         }
1324         catch(exception& e) {
1325                 m->errorOut(e, "TrimSeqsCommand", "cullLength");
1326                 exit(1);
1327         }
1328         
1329 }
1330
1331 //***************************************************************************************************************
1332
1333 bool TrimSeqsCommand::cullHomoP(Sequence& seq){
1334         try {
1335                 int longHomoP = seq.getLongHomoPolymer();
1336                 bool success = 0;       //guilty until proven innocent
1337                 
1338                 if(longHomoP <= maxHomoP){      success = 1;    }
1339                 else                                    {       success = 0;    }
1340                 
1341                 return success;
1342         }
1343         catch(exception& e) {
1344                 m->errorOut(e, "TrimSeqsCommand", "cullHomoP");
1345                 exit(1);
1346         }
1347         
1348 }
1349
1350 //***************************************************************************************************************
1351
1352 bool TrimSeqsCommand::cullAmbigs(Sequence& seq){
1353         try {
1354                 int numNs = seq.getAmbigBases();
1355                 bool success = 0;       //guilty until proven innocent
1356                 
1357                 if(numNs <= maxAmbig)   {       success = 1;    }
1358                 else                                    {       success = 0;    }
1359                 
1360                 return success;
1361         }
1362         catch(exception& e) {
1363                 m->errorOut(e, "TrimSeqsCommand", "cullAmbigs");
1364                 exit(1);
1365         }
1366         
1367 }
1368
1369 //***************************************************************************************************************
1370
1371 bool TrimSeqsCommand::compareDNASeq(string oligo, string seq){
1372         try {
1373                 bool success = 1;
1374                 int length = oligo.length();
1375                 
1376                 for(int i=0;i<length;i++){
1377                         
1378                         if(oligo[i] != seq[i]){
1379                                 if(oligo[i] == 'A' || oligo[i] == 'T' || oligo[i] == 'G' || oligo[i] == 'C')    {       success = 0;    }
1380                                 else if((oligo[i] == 'N' || oligo[i] == 'I') && (seq[i] == 'N'))                                {       success = 0;    }
1381                                 else if(oligo[i] == 'R' && (seq[i] != 'A' && seq[i] != 'G'))                                    {       success = 0;    }
1382                                 else if(oligo[i] == 'Y' && (seq[i] != 'C' && seq[i] != 'T'))                                    {       success = 0;    }
1383                                 else if(oligo[i] == 'M' && (seq[i] != 'C' && seq[i] != 'A'))                                    {       success = 0;    }
1384                                 else if(oligo[i] == 'K' && (seq[i] != 'T' && seq[i] != 'G'))                                    {       success = 0;    }
1385                                 else if(oligo[i] == 'W' && (seq[i] != 'T' && seq[i] != 'A'))                                    {       success = 0;    }
1386                                 else if(oligo[i] == 'S' && (seq[i] != 'C' && seq[i] != 'G'))                                    {       success = 0;    }
1387                                 else if(oligo[i] == 'B' && (seq[i] != 'C' && seq[i] != 'T' && seq[i] != 'G'))   {       success = 0;    }
1388                                 else if(oligo[i] == 'D' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'G'))   {       success = 0;    }
1389                                 else if(oligo[i] == 'H' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'C'))   {       success = 0;    }
1390                                 else if(oligo[i] == 'V' && (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G'))   {       success = 0;    }                       
1391                                 
1392                                 if(success == 0)        {       break;   }
1393                         }
1394                         else{
1395                                 success = 1;
1396                         }
1397                 }
1398                 
1399                 return success;
1400         }
1401         catch(exception& e) {
1402                 m->errorOut(e, "TrimSeqsCommand", "compareDNASeq");
1403                 exit(1);
1404         }
1405
1406 }
1407 //***************************************************************************************************************
1408
1409 int TrimSeqsCommand::countDiffs(string oligo, string seq){
1410         try {
1411
1412                 int length = oligo.length();
1413                 int countDiffs = 0;
1414                 
1415                 for(int i=0;i<length;i++){
1416                                                                 
1417                         if(oligo[i] != seq[i]){
1418                                 if(oligo[i] == 'A' || oligo[i] == 'T' || oligo[i] == 'G' || oligo[i] == 'C' || oligo[i] == '-' || oligo[i] == '.')      {       countDiffs++;   }
1419                                 else if((oligo[i] == 'N' || oligo[i] == 'I') && (seq[i] == 'N'))                                {       countDiffs++;   }
1420                                 else if(oligo[i] == 'R' && (seq[i] != 'A' && seq[i] != 'G'))                                    {       countDiffs++;   }
1421                                 else if(oligo[i] == 'Y' && (seq[i] != 'C' && seq[i] != 'T'))                                    {       countDiffs++;   }
1422                                 else if(oligo[i] == 'M' && (seq[i] != 'C' && seq[i] != 'A'))                                    {       countDiffs++;   }
1423                                 else if(oligo[i] == 'K' && (seq[i] != 'T' && seq[i] != 'G'))                                    {       countDiffs++;   }
1424                                 else if(oligo[i] == 'W' && (seq[i] != 'T' && seq[i] != 'A'))                                    {       countDiffs++;   }
1425                                 else if(oligo[i] == 'S' && (seq[i] != 'C' && seq[i] != 'G'))                                    {       countDiffs++;   }
1426                                 else if(oligo[i] == 'B' && (seq[i] != 'C' && seq[i] != 'T' && seq[i] != 'G'))   {       countDiffs++;   }
1427                                 else if(oligo[i] == 'D' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'G'))   {       countDiffs++;   }
1428                                 else if(oligo[i] == 'H' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'C'))   {       countDiffs++;   }
1429                                 else if(oligo[i] == 'V' && (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G'))   {       countDiffs++;   }       
1430                         }
1431                         
1432                 }
1433                 
1434                 return countDiffs;
1435         }
1436         catch(exception& e) {
1437                 m->errorOut(e, "TrimSeqsCommand", "countDiffs");
1438                 exit(1);
1439         }
1440
1441 }
1442 //***************************************************************************************************************
1443
1444 //bool TrimSeqsCommand::stripQualThreshold(Sequence& seq, ifstream& qFile){
1445 //      try {
1446 //              
1447 //              string rawSequence = seq.getUnaligned();
1448 //              int seqLength = seq.getNumBases();
1449 //              bool success = 0;       //guilty until proven innocent
1450 //              string name;
1451 //              
1452 //              qFile >> name;
1453 //              if (name[0] == '>') {  if(name.substr(1) != seq.getName())      {       m->mothurOut("sequence name mismatch btwn fasta: " + seq.getName() + " and qual file: " + name); m->mothurOutEndLine(); }  }
1454 //              
1455 //              while (!qFile.eof())    {       char c = qFile.get(); if (c == 10 || c == 13){  break;  }       }
1456 //              
1457 //              int score;
1458 //              int end = seqLength;
1459 //              
1460 //              for(int i=0;i<seqLength;i++){
1461 //                      qFile >> score;
1462 //                      
1463 //                      if(score < qThreshold){
1464 //                              end = i;
1465 //                              break;
1466 //                      }
1467 //              }
1468 //              for(int i=end+1;i<seqLength;i++){
1469 //                      qFile >> score;
1470 //              }
1471 //              
1472 //              seq.setUnaligned(rawSequence.substr(0,end));
1473 //              
1474 //              return 1;
1475 //      }
1476 //      catch(exception& e) {
1477 //              m->errorOut(e, "TrimSeqsCommand", "stripQualThreshold");
1478 //              exit(1);
1479 //      }
1480 //}
1481
1482 //***************************************************************************************************************
1483
1484 //bool TrimSeqsCommand::cullQualAverage(Sequence& seq, ifstream& qFile){
1485 //      try {
1486 //              string rawSequence = seq.getUnaligned();
1487 //              int seqLength = seq.getNumBases();
1488 //              bool success = 0;       //guilty until proven innocent
1489 //              string name;
1490 //              
1491 //              qFile >> name;
1492 //              if (name[0] == '>') {  if(name.substr(1) != seq.getName())      {       m->mothurOut("sequence name mismatch btwn fasta: " + seq.getName() + " and qual file: " + name); m->mothurOutEndLine(); }  }
1493 //              
1494 //              while (!qFile.eof())    {       char c = qFile.get(); if (c == 10 || c == 13){  break;  }       }
1495 //              
1496 //              float score;    
1497 //              float average = 0;
1498 //              
1499 //              for(int i=0;i<seqLength;i++){
1500 //                      qFile >> score;
1501 //                      average += score;
1502 //              }
1503 //              average /= seqLength;
1504 //
1505 //              if(average >= qAverage) {       success = 1;    }
1506 //              else                                    {       success = 0;    }
1507 //              
1508 //              return success;
1509 //      }
1510 //      catch(exception& e) {
1511 //              m->errorOut(e, "TrimSeqsCommand", "cullQualAverage");
1512 //              exit(1);
1513 //      }
1514 //}
1515
1516 //***************************************************************************************************************