]> git.donarmstrong.com Git - mothur.git/blob - trimseqscommand.cpp
b12da1cfe126bbdfc562172772093cf84e733457
[mothur.git] / trimseqscommand.cpp
1 /*
2  *  trimseqscommand.cpp
3  *  Mothur
4  *
5  *  Created by Pat Schloss on 6/6/09.
6  *  Copyright 2009 Patrick D. Schloss. All rights reserved.
7  *
8  */
9
10 #include "trimseqscommand.h"
11 #include "needlemanoverlap.hpp"
12
13 //***************************************************************************************************************
14
15 TrimSeqsCommand::TrimSeqsCommand(string option)  {
16         try {
17                 
18                 abort = false;
19                 
20                 //allow user to run help
21                 if(option == "help") { help(); abort = true; }
22                 
23                 else {
24                         //valid paramters for this command
25                         string AlignArray[] =  {"fasta", "flip", "oligos", "maxambig", "maxhomop", "minlength", "maxlength", "qfile", 
26                                                                         "qthreshold", "qaverage", "allfiles", "qtrim","tdiffs", "pdiffs", "bdiffs", "processors", "outputdir","inputdir"};
27                         
28                         vector<string> myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string)));
29                         
30                         OptionParser parser(option);
31                         map<string,string> parameters = parser.getParameters();
32                         
33                         ValidParameters validParameter;
34                         map<string,string>::iterator it;
35                         
36                         //check to make sure all parameters are valid for command
37                         for (it = parameters.begin(); it != parameters.end(); it++) { 
38                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
39                         }
40                         
41                         //if the user changes the input directory command factory will send this info to us in the output parameter 
42                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
43                         if (inputDir == "not found"){   inputDir = "";          }
44                         else {
45                                 string path;
46                                 it = parameters.find("fasta");
47                                 //user has given a template file
48                                 if(it != parameters.end()){ 
49                                         path = hasPath(it->second);
50                                         //if the user has not given a path then, add inputdir. else leave path alone.
51                                         if (path == "") {       parameters["fasta"] = inputDir + it->second;            }
52                                 }
53                                 
54                                 it = parameters.find("oligos");
55                                 //user has given a template file
56                                 if(it != parameters.end()){ 
57                                         path = hasPath(it->second);
58                                         //if the user has not given a path then, add inputdir. else leave path alone.
59                                         if (path == "") {       parameters["oligos"] = inputDir + it->second;           }
60                                 }
61                                 
62                                 it = parameters.find("qfile");
63                                 //user has given a template file
64                                 if(it != parameters.end()){ 
65                                         path = hasPath(it->second);
66                                         //if the user has not given a path then, add inputdir. else leave path alone.
67                                         if (path == "") {       parameters["qfile"] = inputDir + it->second;            }
68                                 }
69                         }
70
71                         
72                         //check for required parameters
73                         fastaFile = validParameter.validFile(parameters, "fasta", true);
74                         if (fastaFile == "not found") { m->mothurOut("fasta is a required parameter for the screen.seqs command."); m->mothurOutEndLine(); abort = true; }
75                         else if (fastaFile == "not open") { abort = true; }     
76                         
77                         //if the user changes the output directory command factory will send this info to us in the output parameter 
78                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  
79                                 outputDir = ""; 
80                                 outputDir += hasPath(fastaFile); //if user entered a file with a path then preserve it  
81                         }
82                 
83                         //check for optional parameter and set defaults
84                         // ...at some point should added some additional type checking...
85                         string temp;
86                         temp = validParameter.validFile(parameters, "flip", false);
87                         if (temp == "not found"){       flip = 0;       }
88                         else if(isTrue(temp))   {       flip = 1;       }
89                 
90                         temp = validParameter.validFile(parameters, "oligos", true);
91                         if (temp == "not found"){       oligoFile = "";         }
92                         else if(temp == "not open"){    abort = true;   } 
93                         else                                    {       oligoFile = temp;               }
94                         
95                         temp = validParameter.validFile(parameters, "maxambig", false);         if (temp == "not found") { temp = "-1"; }
96                         convert(temp, maxAmbig);  
97
98                         temp = validParameter.validFile(parameters, "maxhomop", false);         if (temp == "not found") { temp = "0"; }
99                         convert(temp, maxHomoP);  
100
101                         temp = validParameter.validFile(parameters, "minlength", false);        if (temp == "not found") { temp = "0"; }
102                         convert(temp, minLength); 
103                         
104                         temp = validParameter.validFile(parameters, "maxlength", false);        if (temp == "not found") { temp = "0"; }
105                         convert(temp, maxLength);
106                         
107                         temp = validParameter.validFile(parameters, "tdiffs", false);           if (temp == "not found") { temp = "0"; }
108                         convert(temp, tdiffs);
109                         
110                         temp = validParameter.validFile(parameters, "bdiffs", false);           if (temp == "not found") { temp = "0"; }
111                         convert(temp, bdiffs);
112                         
113                         temp = validParameter.validFile(parameters, "pdiffs", false);           if (temp == "not found") { temp = "0"; }
114                         convert(temp, pdiffs);
115                         
116                         temp = validParameter.validFile(parameters, "qfile", true);     
117                         if (temp == "not found")        {       qFileName = "";         }
118                         else if(temp == "not open")     {       abort = true;           }
119                         else                                            {       qFileName = temp;       }
120                         
121                         temp = validParameter.validFile(parameters, "qthreshold", false);       if (temp == "not found") { temp = "0"; }
122                         convert(temp, qThreshold);
123                         
124                         temp = validParameter.validFile(parameters, "qtrim", false);    if (temp == "not found") { temp = "F"; }
125                         qtrim = isTrue(temp);
126
127                         temp = validParameter.validFile(parameters, "qaverage", false);         if (temp == "not found") { temp = "0"; }
128                         convert(temp, qAverage);
129                         
130                         temp = validParameter.validFile(parameters, "allfiles", false);         if (temp == "not found") { temp = "F"; }
131                         allFiles = isTrue(temp);
132                         
133                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found"){       temp = "1";                             }
134                         convert(temp, processors); 
135                         
136                         if(allFiles && oligoFile == ""){
137                                 m->mothurOut("You selected allfiles, but didn't enter an oligos file.  Ignoring the allfiles request."); m->mothurOutEndLine();
138                         }
139                         if((qAverage != 0 && qThreshold != 0) && qFileName == ""){
140                                 m->mothurOut("You didn't provide a quality file name, quality criteria will be ignored."); m->mothurOutEndLine();
141                                 qAverage=0;
142                                 qThreshold=0;
143                         }
144                         if(!flip && oligoFile=="" && !maxLength && !minLength && (maxAmbig==-1) && !maxHomoP && qFileName == ""){               
145                                 m->mothurOut("You didn't set any options... quiting command."); m->mothurOutEndLine();
146                                 abort = true;
147                         }
148                 }
149
150         }
151         catch(exception& e) {
152                 m->errorOut(e, "TrimSeqsCommand", "TrimSeqsCommand");
153                 exit(1);
154         }
155 }
156 //**********************************************************************************************************************
157
158 void TrimSeqsCommand::help(){
159         try {
160                 m->mothurOut("The trim.seqs command reads a fastaFile and creates .....\n");
161                 m->mothurOut("The trim.seqs command parameters are fasta, flip, oligos, maxambig, maxhomop, minlength, maxlength, qfile, qthreshold, qaverage, diffs, qtrim and allfiles.\n");
162                 m->mothurOut("The fasta parameter is required.\n");
163                 m->mothurOut("The flip parameter .... The default is 0.\n");
164                 m->mothurOut("The oligos parameter .... The default is "".\n");
165                 m->mothurOut("The maxambig parameter .... The default is -1.\n");
166                 m->mothurOut("The maxhomop parameter .... The default is 0.\n");
167                 m->mothurOut("The minlength parameter .... The default is 0.\n");
168                 m->mothurOut("The maxlength parameter .... The default is 0.\n");
169                 m->mothurOut("The tdiffs parameter is used to specify the total number of differences allowed in the sequence. The default is 0.\n");
170                 m->mothurOut("The bdiffs parameter is used to specify the number of differences allowed in the barcode. The default is 0.\n");
171                 m->mothurOut("The pdiffs parameter is used to specify the number of differences allowed in the primer. The default is 0.\n");
172                 m->mothurOut("The qfile parameter .....\n");
173                 m->mothurOut("The qthreshold parameter .... The default is 0.\n");
174                 m->mothurOut("The qaverage parameter .... The default is 0.\n");
175                 m->mothurOut("The allfiles parameter .... The default is F.\n");
176                 m->mothurOut("The qtrim parameter .... The default is F.\n");
177                 m->mothurOut("The trim.seqs command should be in the following format: \n");
178                 m->mothurOut("trim.seqs(fasta=yourFastaFile, flip=yourFlip, oligos=yourOligos, maxambig=yourMaxambig,  \n");
179                 m->mothurOut("maxhomop=yourMaxhomop, minlength=youMinlength, maxlength=yourMaxlength)  \n");    
180                 m->mothurOut("Example trim.seqs(fasta=abrecovery.fasta, flip=..., oligos=..., maxambig=..., maxhomop=..., minlength=..., maxlength=...).\n");
181                 m->mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n");
182                 m->mothurOut("For more details please check out the wiki http://www.mothur.org/wiki/Trim.seqs .\n\n");
183
184         }
185         catch(exception& e) {
186                 m->errorOut(e, "TrimSeqsCommand", "help");
187                 exit(1);
188         }
189 }
190
191
192 //***************************************************************************************************************
193
194 TrimSeqsCommand::~TrimSeqsCommand(){    /*      do nothing      */      }
195
196 //***************************************************************************************************************
197
198 int TrimSeqsCommand::execute(){
199         try{
200         
201                 if (abort == true) { return 0; }
202                 
203                 numFPrimers = 0;  //this needs to be initialized
204                 numRPrimers = 0;
205                 
206                 string trimSeqFile = outputDir + getRootName(getSimpleName(fastaFile)) + "trim.fasta";
207                 outputNames.push_back(trimSeqFile);
208                 string scrapSeqFile = outputDir + getRootName(getSimpleName(fastaFile)) + "scrap.fasta";
209                 outputNames.push_back(scrapSeqFile);
210                 string groupFile = outputDir + getRootName(getSimpleName(fastaFile)) + "groups";
211                 
212                 vector<string> fastaFileNames;
213                 if(oligoFile != ""){
214                         outputNames.push_back(groupFile);
215                         getOligos(fastaFileNames);
216                 }
217                 
218                 if(qFileName != "")     {       setLines(qFileName, qLines);    }
219
220                 
221                 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
222                                 if(processors == 1){
223                                         ifstream inFASTA;
224                                         openInputFile(fastaFile, inFASTA);
225                                         int numSeqs=count(istreambuf_iterator<char>(inFASTA),istreambuf_iterator<char>(), '>');
226                                         inFASTA.close();
227                                         
228                                         lines.push_back(new linePair(0, numSeqs));
229                                         
230                                         driverCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, groupFile, fastaFileNames, lines[0], lines[0]);
231                                         
232                                         for (int j = 0; j < fastaFileNames.size(); j++) {
233                                                 rename((fastaFileNames[j] + toString(getpid()) + ".temp").c_str(), fastaFileNames[j].c_str());
234                                         }
235
236                                 }else{
237                                         setLines(fastaFile, lines);     
238                                         if(qFileName == "")     {       qLines = lines; }       
239                                                                 
240                                         createProcessesCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, groupFile, fastaFileNames); 
241                                         
242                                         rename((trimSeqFile + toString(processIDS[0]) + ".temp").c_str(), trimSeqFile.c_str());
243                                         rename((scrapSeqFile + toString(processIDS[0]) + ".temp").c_str(), scrapSeqFile.c_str());
244                                         rename((groupFile + toString(processIDS[0]) + ".temp").c_str(), groupFile.c_str());
245                                         for (int j = 0; j < fastaFileNames.size(); j++) {
246                                                 rename((fastaFileNames[j] + toString(processIDS[0]) + ".temp").c_str(), fastaFileNames[j].c_str());
247                                         }
248                                         //append files
249                                         for(int i=1;i<processors;i++){
250                                                 appendFiles((trimSeqFile + toString(processIDS[i]) + ".temp"), trimSeqFile);
251                                                 remove((trimSeqFile + toString(processIDS[i]) + ".temp").c_str());
252                                                 appendFiles((scrapSeqFile + toString(processIDS[i]) + ".temp"), scrapSeqFile);
253                                                 remove((scrapSeqFile + toString(processIDS[i]) + ".temp").c_str());
254                                                 appendFiles((groupFile + toString(processIDS[i]) + ".temp"), groupFile);
255                                                 remove((groupFile + toString(processIDS[i]) + ".temp").c_str());
256                                                 for (int j = 0; j < fastaFileNames.size(); j++) {
257                                                         appendFiles((fastaFileNames[j] + toString(processIDS[i]) + ".temp"), fastaFileNames[j]);
258                                                         remove((fastaFileNames[j] + toString(processIDS[i]) + ".temp").c_str());
259                                                 }
260                                         }
261                                 }
262                                 
263                                 if (m->control_pressed) {  return 0; }
264                 #else
265                                 ifstream inFASTA;
266                                 openInputFile(fastafileNames[s], inFASTA);
267                                 numSeqs=count(istreambuf_iterator<char>(inFASTA),istreambuf_iterator<char>(), '>');
268                                 inFASTA.close();
269                                 
270                                 lines.push_back(new linePair(0, numSeqs));
271                                 
272                                 driverCreateSummary(fastafile, qFileName, trimSeqFile, scrapSeqFile, groupFile, fastaFileNames, lines[0], lines[0]);
273                                 
274                                 if (m->control_pressed) {  return 0; }
275                 #endif
276                                                 
277                                                                                 
278                 for(int i=0;i<fastaFileNames.size();i++){
279                         ifstream inFASTA;
280                         string seqName;
281                         openInputFile(getRootName(fastaFile) + groupVector[i] + ".fasta", inFASTA);
282                         ofstream outGroups;
283                         openOutputFile(outputDir + getRootName(getSimpleName(fastaFile)) + groupVector[i] + ".groups", outGroups);
284                         outputNames.push_back(outputDir + getRootName(getSimpleName(fastaFile)) + groupVector[i] + ".groups");
285                         
286                         while(!inFASTA.eof()){
287                                 if(inFASTA.get() == '>'){
288                                         inFASTA >> seqName;
289                                         outGroups << seqName << '\t' << groupVector[i] << endl;
290                                 }
291                                 while (!inFASTA.eof())  {       char c = inFASTA.get(); if (c == 10 || c == 13){        break;  }       }
292                         }
293                         outGroups.close();
294                         inFASTA.close();
295                 }
296                 
297                 if (m->control_pressed) { 
298                         for (int i = 0; i < outputNames.size(); i++) {  remove(outputNames[i].c_str()); }
299                         return 0;
300                 }
301
302                 m->mothurOutEndLine();
303                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
304                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
305                 m->mothurOutEndLine();
306                 
307                 return 0;       
308                         
309         }
310         catch(exception& e) {
311                 m->errorOut(e, "TrimSeqsCommand", "execute");
312                 exit(1);
313         }
314 }
315                 
316 /**************************************************************************************/
317 int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string trimFile, string scrapFile, string groupFile, vector<string> fastaNames, linePair* line, linePair* qline) {     
318         try {
319                 
320                 ofstream outFASTA;
321                 int able = openOutputFile(trimFile, outFASTA);
322                 
323                 ofstream scrapFASTA;
324                 openOutputFile(scrapFile, scrapFASTA);
325                 
326                 ofstream outGroups;
327                 vector<ofstream*> fastaFileNames;
328                 if (oligoFile != "") {          
329                         openOutputFile(groupFile, outGroups);   
330                         for (int i = 0; i < fastaNames.size(); i++) {
331                                 fastaFileNames.push_back(new ofstream((fastaNames[i] + toString(getpid()) + ".temp").c_str(), ios::ate)); 
332                         }
333                 }
334                 
335                 ifstream inFASTA;
336                 openInputFile(filename, inFASTA);
337                 
338                 ifstream qFile;
339                 if(qFileName != "")     {       openInputFile(qFileName, qFile);        }
340                 
341                 qFile.seekg(qline->start);
342                 inFASTA.seekg(line->start);
343                 
344                 for(int i=0;i<line->num;i++){
345                                 
346                         if (m->control_pressed) { 
347                                 inFASTA.close(); 
348                                 outFASTA.close();
349                                 scrapFASTA.close();
350                                 if (oligoFile != "") {   outGroups.close();   }
351                                 if(qFileName != "")     {       qFile.close();  }
352                                 for(int i=0;i<fastaFileNames.size();i++){
353                                         fastaFileNames[i]->close();
354                                         delete fastaFileNames[i];
355                                 }       
356                                 for (int i = 0; i < outputNames.size(); i++) {  remove(outputNames[i].c_str()); }
357                                 return 0;
358                         }
359                         
360                         bool success = 1;
361                         
362                         Sequence currSeq(inFASTA);
363
364                         string origSeq = currSeq.getUnaligned();
365                         if (origSeq != "") {
366                                 int group;
367                                 string trashCode = "";
368                                 int currentSeqsDiffs = 0;
369                                 
370                                 if(qFileName != ""){
371                                         if(qThreshold != 0)             {       success = stripQualThreshold(currSeq, qFile);   }
372                                         else if(qAverage != 0)  {       success = cullQualAverage(currSeq, qFile);              }
373                                         if ((!qtrim) && (origSeq.length() != currSeq.getUnaligned().length())) { 
374                                                 success = 0; //if you don't want to trim and the sequence does not meet quality requirements, move to scrap
375                                         }
376                                         if(!success)                    {       trashCode += 'q';                                                               }
377                                 }
378                         
379                                 if(barcodes.size() != 0){
380                                         success = stripBarcode(currSeq, group);
381                                         if(!success){   trashCode += 'b';       }
382                                         else{ currentSeqsDiffs += currentSeqsTdiffs;  }
383                                 }
384                         
385                                 if(numFPrimers != 0){
386                                         success = stripForward(currSeq);
387                                         if(!success){   trashCode += 'f';       }
388                                         else{ currentSeqsDiffs += currentSeqsTdiffs;  }
389                                 }
390                                 
391                                 if (currentSeqsDiffs > tdiffs) { trashCode += 't';   }
392
393                                 if(numRPrimers != 0){
394                                         success = stripReverse(currSeq);
395                                         if(!success){   trashCode += 'r';       }
396                                 }
397                 
398                                 if(minLength > 0 || maxLength > 0){
399                                         success = cullLength(currSeq);
400                                         if(!success){   trashCode += 'l'; }
401                                 }
402                                 if(maxHomoP > 0){
403                                         success = cullHomoP(currSeq);
404                                         if(!success){   trashCode += 'h';       }
405                                 }
406                                 if(maxAmbig != -1){
407                                         success = cullAmbigs(currSeq);
408                                         if(!success){   trashCode += 'n';       }
409                                 }
410                                 
411                                 if(flip){       currSeq.reverseComplement();    }               // should go last                       
412                                 
413                                 if(trashCode.length() == 0){
414                                         currSeq.setAligned(currSeq.getUnaligned());  //this is because of a modification we made to the sequence class to fix a bug.  all seqs have an aligned version, which is the version that gets printed.
415                                         currSeq.printSequence(outFASTA);
416                                         if(barcodes.size() != 0){
417                                                 outGroups << currSeq.getName() << '\t' << groupVector[group] << endl;
418                                                 
419                                                 if(allFiles){
420                                                         currSeq.printSequence(*fastaFileNames[group]);                                  
421                                                 }
422                                         }
423                                 }
424                                 else{
425                                         currSeq.setName(currSeq.getName() + '|' + trashCode);
426                                         currSeq.setUnaligned(origSeq);
427                                         currSeq.printSequence(scrapFASTA);
428                                 }
429                         }
430                         gobble(inFASTA);
431                 }
432                 
433                 inFASTA.close();
434                 outFASTA.close();
435                 scrapFASTA.close();
436                 if (oligoFile != "") {   outGroups.close();   }
437                 if(qFileName != "")     {       qFile.close();  }
438                 
439                 for(int i=0;i<fastaFileNames.size();i++){
440                         fastaFileNames[i]->close();
441                         delete fastaFileNames[i];
442                 }               
443                 
444                 return 0;
445         }
446         catch(exception& e) {
447                 m->errorOut(e, "TrimSeqsCommand", "driverCreateTrim");
448                 exit(1);
449         }
450 }
451 /**************************************************************************************************/
452 int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName, string trimFile, string scrapFile, string groupFile, vector<string> fastaNames) {
453         try {
454 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
455                 int process = 0;
456                 int exitCommand = 1;
457                 processIDS.clear();
458                 
459                 //loop through and create all the processes you want
460                 while (process != processors) {
461                         int pid = fork();
462                         
463                         if (pid > 0) {
464                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
465                                 process++;
466                         }else if (pid == 0){
467                                 driverCreateTrim(filename, qFileName, (trimFile + toString(getpid()) + ".temp"), (scrapFile + toString(getpid()) + ".temp"), (groupFile + toString(getpid()) + ".temp"), fastaNames, lines[process], qLines[process]);
468                                 exit(0);
469                         }else { m->mothurOut("unable to spawn the necessary processes."); m->mothurOutEndLine(); exit(0); }
470                 }
471                 
472                 //force parent to wait until all the processes are done
473                 for (int i=0;i<processors;i++) { 
474                         int temp = processIDS[i];
475                         wait(&temp);
476                 }
477                 
478                 return exitCommand;
479 #endif          
480         }
481         catch(exception& e) {
482                 m->errorOut(e, "TrimSeqsCommand", "createProcessesCreateTrim");
483                 exit(1);
484         }
485 }
486 /**************************************************************************************************/
487
488 int TrimSeqsCommand::setLines(string filename, vector<linePair*>& lines) {
489         try {
490                 
491                 lines.clear();
492                 
493                 vector<long int> positions;
494                 
495                 ifstream inFASTA;
496                 openInputFile(filename, inFASTA);
497                         
498                 string input;
499                 while(!inFASTA.eof()){  
500                         input = getline(inFASTA);
501
502                         if (input.length() != 0) {
503                                 if(input[0] == '>'){ long int pos = inFASTA.tellg(); positions.push_back(pos - input.length() - 1);     }
504                         }
505                 }
506                 inFASTA.close();
507                 
508                 int numFastaSeqs = positions.size();
509         
510                 FILE * pFile;
511                 long size;
512                 
513                 //get num bytes in file
514                 pFile = fopen (filename.c_str(),"rb");
515                 if (pFile==NULL) perror ("Error opening file");
516                 else{
517                         fseek (pFile, 0, SEEK_END);
518                         size=ftell (pFile);
519                         fclose (pFile);
520                 }
521                 
522                 int numSeqsPerProcessor = numFastaSeqs / processors;
523                 
524                 for (int i = 0; i < processors; i++) {
525
526                         long int startPos = positions[ i * numSeqsPerProcessor ];
527                         if(i == processors - 1){
528                                 numSeqsPerProcessor = numFastaSeqs - i * numSeqsPerProcessor;
529                         }else{  
530                                 long int myEnd = positions[ (i+1) * numSeqsPerProcessor ];
531                         }
532                         lines.push_back(new linePair(startPos, numSeqsPerProcessor));
533                 }
534                 
535                 return numFastaSeqs;
536         }
537         catch(exception& e) {
538                 m->errorOut(e, "TrimSeqsCommand", "setLines");
539                 exit(1);
540         }
541 }
542 //***************************************************************************************************************
543
544 void TrimSeqsCommand::getOligos(vector<string>& outFASTAVec){ //vector<ofstream*>& outFASTAVec
545         try {
546                 ifstream inOligos;
547                 openInputFile(oligoFile, inOligos);
548                 
549                 ofstream test;
550                 
551                 string type, oligo, group;
552                 int index=0;
553                 
554                 while(!inOligos.eof()){
555                         inOligos >> type;
556                         
557                         if(type[0] == '#'){
558                                 while (!inOligos.eof()) {       char c = inOligos.get(); if (c == 10 || c == 13){       break;  }       } // get rest of line if there's any crap there
559                         }
560                         else{
561                                 inOligos >> oligo;
562                                 
563                                 for(int i=0;i<oligo.length();i++){
564                                         oligo[i] = toupper(oligo[i]);
565                                         if(oligo[i] == 'U')     {       oligo[i] = 'T'; }
566                                 }
567                                 
568                                 if(type == "forward"){
569                                         forPrimer.push_back(oligo);
570                                 }
571                                 else if(type == "reverse"){
572                                         Sequence oligoRC("reverse", oligo);
573                                         oligoRC.reverseComplement();
574                                         revPrimer.push_back(oligoRC.getUnaligned());
575                                 }
576                                 else if(type == "barcode"){
577                                         inOligos >> group;
578                                         barcodes[oligo]=index++;
579                                         groupVector.push_back(group);
580                                         
581                                         if(allFiles){
582                                                 //outFASTAVec.push_back(new ofstream((outputDir + getRootName(getSimpleName(fastaFile)) + group + ".fasta").c_str(), ios::ate));
583                                                 outputNames.push_back((outputDir + getRootName(getSimpleName(fastaFile)) + group + ".fasta"));
584                                                 outFASTAVec.push_back((outputDir + getRootName(getSimpleName(fastaFile)) + group + ".fasta"));
585                                         }
586                                 }
587                         }
588                 }
589                 
590                 inOligos.close();
591                 
592                 numFPrimers = forPrimer.size();
593                 numRPrimers = revPrimer.size();
594                 
595         }
596         catch(exception& e) {
597                 m->errorOut(e, "TrimSeqsCommand", "getOligos");
598                 exit(1);
599         }
600 }
601 //***************************************************************************************************************
602
603 bool TrimSeqsCommand::stripBarcode(Sequence& seq, int& group){
604         try {
605                 string rawSequence = seq.getUnaligned();
606                 bool success = 0;       //guilty until proven innocent
607                 
608                 //can you find the barcode
609                 for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
610                         string oligo = it->first;
611                         if(rawSequence.length() < oligo.length()){      //let's just assume that the barcodes are the same length
612                                 success = 0;
613                                 break;
614                         }
615                         
616                         if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
617                                 group = it->second;
618                                 seq.setUnaligned(rawSequence.substr(oligo.length()));
619                                 success = 1;
620                                 break;
621                         }
622                 }
623                 
624                 //if you found the barcode or if you don't want to allow for diffs
625                 if ((bdiffs == 0) || (success == 1)) { return success;  }
626                 
627                 else { //try aligning and see if you can find it
628                         
629                         Alignment* alignment;
630                         if (barcodes.size() > 0) { //assumes barcodes are all the same length
631                                 map<string,int>::iterator it=barcodes.begin();
632                                 string temp = it->first;
633                                 
634                                 alignment = new NeedlemanOverlap(-2.0, 1.0, -1.0, (temp.length()+bdiffs+1));  
635                         }else{ alignment = NULL; } 
636                         
637
638                         //can you find the barcode
639                         for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
640                                 string oligo = it->first;
641                                 int length = oligo.length();
642                                 
643                                 if(rawSequence.length() < oligo.length()){      //let's just assume that the barcodes are the same length
644                                         success = 0;
645                                         break;
646                                 }
647                                 
648                                 //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
649                                 alignment->align(oligo, rawSequence.substr(0,length+bdiffs));
650                                 oligo = alignment->getSeqAAln();
651                                 string temp = alignment->getSeqBAln();
652                 //cout << "barcode = " << oligo << " raw = " << rawSequence.substr(0,oligo.length()) << " raw aligned = " << temp << endl;                      
653                                 
654                                 int newStart=0;
655                                 if(compareDNASeq(oligo, temp, length, newStart, bdiffs)){
656                                         group = it->second;
657                                         seq.setUnaligned(rawSequence.substr(newStart));
658                                         success = 1;
659                                         break;
660                                 }
661                         }
662                         
663                         if (alignment != NULL) {  delete alignment;  }
664                 }
665                 return success;
666                 
667         }
668         catch(exception& e) {
669                 m->errorOut(e, "TrimSeqsCommand", "stripBarcode");
670                 exit(1);
671         }
672
673 }
674
675 //***************************************************************************************************************
676
677 bool TrimSeqsCommand::stripForward(Sequence& seq){
678         try {
679                 string rawSequence = seq.getUnaligned();
680                 bool success = 0;       //guilty until proven innocent
681                 
682                 for(int i=0;i<numFPrimers;i++){
683                         string oligo = forPrimer[i];
684                         
685                         if(rawSequence.length() < oligo.length()){
686                                 success = 0;
687                                 break;
688                         }
689                         
690                         if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
691                                 seq.setUnaligned(rawSequence.substr(oligo.length()));
692                                 success = 1;
693                                 break;
694                         }
695                 }
696                 
697                 //if you found the primer or if you don't want to allow for diffs
698                 if ((pdiffs == 0) || (success == 1)) { return success;  }
699                 
700                 else { //try aligning and see if you can find it
701                         
702                         Alignment* alignment;
703                         if (numFPrimers > 0) {  alignment = new NeedlemanOverlap(-2.0, 1.0, -1.0, (forPrimer[0].length()+pdiffs+1));  } 
704                         else{ alignment = NULL; } 
705                         //can you find the primer
706                         for(int i=0;i<numFPrimers;i++){
707                                 string oligo = forPrimer[i];
708                                 int length = oligo.length();
709                         
710                                 if(rawSequence.length() < oligo.length()){      
711                                         success = 0;
712                                         break;
713                                 }
714                         
715                                 //resize if neccessary
716                                 if ((length+pdiffs+1) > alignment->getnRows()) { alignment->resize(length+pdiffs+1);    }
717                                 
718                                 //use needleman to align first primer.length()+numdiffs of sequence to each primer
719                                 alignment->align(oligo, rawSequence.substr(0,length+pdiffs));
720                                 oligo = alignment->getSeqAAln();
721                                 string temp = alignment->getSeqBAln();
722                         
723                                 int newStart = 0;
724                                 if(compareDNASeq(oligo, temp, length, newStart, pdiffs)){
725                                         seq.setUnaligned(rawSequence.substr(newStart));
726                                         success = 1;
727                                         break;
728                                 }
729                         }
730                         
731                         if (alignment != NULL) {  delete alignment;  }
732                 }
733                 
734                 return success;
735                 
736         }
737         catch(exception& e) {
738                 m->errorOut(e, "TrimSeqsCommand", "stripForward");
739                 exit(1);
740         }
741 }
742
743 //***************************************************************************************************************
744
745 bool TrimSeqsCommand::stripReverse(Sequence& seq){
746         try {
747                 string rawSequence = seq.getUnaligned();
748                 bool success = 0;       //guilty until proven innocent
749                 
750                 for(int i=0;i<numRPrimers;i++){
751                         string oligo = revPrimer[i];
752                         
753                         if(rawSequence.length() < oligo.length()){
754                                 success = 0;
755                                 break;
756                         }
757                         
758                         if(compareDNASeq(oligo, rawSequence.substr(rawSequence.length()-oligo.length(),oligo.length()))){
759                                 seq.setUnaligned(rawSequence.substr(0,rawSequence.length()-oligo.length()));
760                                 success = 1;
761                                 break;
762                         }
763                 }       
764                 return success;
765                 
766         }
767         catch(exception& e) {
768                 m->errorOut(e, "TrimSeqsCommand", "stripReverse");
769                 exit(1);
770         }
771 }
772
773 //***************************************************************************************************************
774
775 bool TrimSeqsCommand::cullLength(Sequence& seq){
776         try {
777         
778                 int length = seq.getNumBases();
779                 bool success = 0;       //guilty until proven innocent
780                 
781                 if(length >= minLength && maxLength == 0)                       {       success = 1;    }
782                 else if(length >= minLength && length <= maxLength)     {       success = 1;    }
783                 else                                                                                            {       success = 0;    }
784                 
785                 return success;
786         
787         }
788         catch(exception& e) {
789                 m->errorOut(e, "TrimSeqsCommand", "cullLength");
790                 exit(1);
791         }
792         
793 }
794
795 //***************************************************************************************************************
796
797 bool TrimSeqsCommand::cullHomoP(Sequence& seq){
798         try {
799                 int longHomoP = seq.getLongHomoPolymer();
800                 bool success = 0;       //guilty until proven innocent
801                 
802                 if(longHomoP <= maxHomoP){      success = 1;    }
803                 else                                    {       success = 0;    }
804                 
805                 return success;
806         }
807         catch(exception& e) {
808                 m->errorOut(e, "TrimSeqsCommand", "cullHomoP");
809                 exit(1);
810         }
811         
812 }
813
814 //***************************************************************************************************************
815
816 bool TrimSeqsCommand::cullAmbigs(Sequence& seq){
817         try {
818                 int numNs = seq.getAmbigBases();
819                 bool success = 0;       //guilty until proven innocent
820                 
821                 if(numNs <= maxAmbig)   {       success = 1;    }
822                 else                                    {       success = 0;    }
823                 
824                 return success;
825         }
826         catch(exception& e) {
827                 m->errorOut(e, "TrimSeqsCommand", "cullAmbigs");
828                 exit(1);
829         }
830         
831 }
832
833 //***************************************************************************************************************
834
835 bool TrimSeqsCommand::compareDNASeq(string oligo, string seq){
836         try {
837                 bool success = 1;
838                 int length = oligo.length();
839                 
840                 for(int i=0;i<length;i++){
841                         
842                         if(oligo[i] != seq[i]){
843                                 if(oligo[i] == 'A' || oligo[i] == 'T' || oligo[i] == 'G' || oligo[i] == 'C')    {       success = 0;    }
844                                 else if((oligo[i] == 'N' || oligo[i] == 'I') && (seq[i] == 'N'))                                {       success = 0;    }
845                                 else if(oligo[i] == 'R' && (seq[i] != 'A' && seq[i] != 'G'))                                    {       success = 0;    }
846                                 else if(oligo[i] == 'Y' && (seq[i] != 'C' && seq[i] != 'T'))                                    {       success = 0;    }
847                                 else if(oligo[i] == 'M' && (seq[i] != 'C' && seq[i] != 'A'))                                    {       success = 0;    }
848                                 else if(oligo[i] == 'K' && (seq[i] != 'T' && seq[i] != 'G'))                                    {       success = 0;    }
849                                 else if(oligo[i] == 'W' && (seq[i] != 'T' && seq[i] != 'A'))                                    {       success = 0;    }
850                                 else if(oligo[i] == 'S' && (seq[i] != 'C' && seq[i] != 'G'))                                    {       success = 0;    }
851                                 else if(oligo[i] == 'B' && (seq[i] != 'C' && seq[i] != 'T' && seq[i] != 'G'))   {       success = 0;    }
852                                 else if(oligo[i] == 'D' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'G'))   {       success = 0;    }
853                                 else if(oligo[i] == 'H' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'C'))   {       success = 0;    }
854                                 else if(oligo[i] == 'V' && (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G'))   {       success = 0;    }                       
855                                 
856                                 if(success == 0)        {       break;   }
857                         }
858                         else{
859                                 success = 1;
860                         }
861                 }
862                 
863                 return success;
864         }
865         catch(exception& e) {
866                 m->errorOut(e, "TrimSeqsCommand", "compareDNASeq");
867                 exit(1);
868         }
869
870 }
871 //***************************************************************************************************************
872
873 bool TrimSeqsCommand::compareDNASeq(string oligo, string seq, int numBases, int& end, int diffs){
874         try {
875                 bool success = 1;
876                 int length = oligo.length();
877                 end = numBases;
878                 int countBases = 0;
879                 int countDiffs = 0;
880                 
881                 if (length != 0) {
882                         if ((oligo[0] == '-') || (oligo[0] == '.')) {  success = 0;  return success;  } //no gaps allowed at beginning
883                 }
884                 
885                 for(int i=0;i<length;i++){
886                         
887                         if ((oligo[i] != '-') && (oligo[i] != '.'))  { countBases++; } 
888                                         
889                         if(oligo[i] != seq[i]){
890                                 if(oligo[i] == 'A' || oligo[i] == 'T' || oligo[i] == 'G' || oligo[i] == 'C' || oligo[i] == '-' || oligo[i] == '.')      {       countDiffs++;   }
891                                 else if((oligo[i] == 'N' || oligo[i] == 'I') && (seq[i] == 'N'))                                {       countDiffs++;   }
892                                 else if(oligo[i] == 'R' && (seq[i] != 'A' && seq[i] != 'G'))                                    {       countDiffs++;   }
893                                 else if(oligo[i] == 'Y' && (seq[i] != 'C' && seq[i] != 'T'))                                    {       countDiffs++;   }
894                                 else if(oligo[i] == 'M' && (seq[i] != 'C' && seq[i] != 'A'))                                    {       countDiffs++;   }
895                                 else if(oligo[i] == 'K' && (seq[i] != 'T' && seq[i] != 'G'))                                    {       countDiffs++;   }
896                                 else if(oligo[i] == 'W' && (seq[i] != 'T' && seq[i] != 'A'))                                    {       countDiffs++;   }
897                                 else if(oligo[i] == 'S' && (seq[i] != 'C' && seq[i] != 'G'))                                    {       countDiffs++;   }
898                                 else if(oligo[i] == 'B' && (seq[i] != 'C' && seq[i] != 'T' && seq[i] != 'G'))   {       countDiffs++;   }
899                                 else if(oligo[i] == 'D' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'G'))   {       countDiffs++;   }
900                                 else if(oligo[i] == 'H' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'C'))   {       countDiffs++;   }
901                                 else if(oligo[i] == 'V' && (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G'))   {       countDiffs++;   }                       
902                                 
903                                 if(countDiffs > diffs)  {       success = 0; break;      }
904                         }
905                         else{
906                                 success = 1;
907                         }
908                         
909                         if (countBases >= numBases) { end = countBases; break; } //stop checking after end of barcode or primer
910                 }
911                 
912                 //if it's a success we want to check for total diffs in driver, so save it.
913                 if (success == 1) {  currentSeqsTdiffs = countDiffs; }
914                 
915                 return success;
916         }
917         catch(exception& e) {
918                 m->errorOut(e, "TrimSeqsCommand", "compareDNASeq");
919                 exit(1);
920         }
921
922 }
923 //***************************************************************************************************************
924
925 bool TrimSeqsCommand::stripQualThreshold(Sequence& seq, ifstream& qFile){
926         try {
927                 string rawSequence = seq.getUnaligned();
928                 int seqLength;  // = rawSequence.length();
929                 string name, temp, temp2;
930                 
931                 qFile >> name >> temp;
932         
933                 splitAtEquals(temp2, temp); //separates length=242, temp=length, temp2=242
934                 convert(temp, seqLength); //converts string to int
935         
936                 if (name.length() != 0) {  if(name.substr(1) != seq.getName())  {       m->mothurOut("sequence name mismatch btwn fasta and qual file"); m->mothurOutEndLine(); }  } 
937                 while (!qFile.eof())    {       char c = qFile.get(); if (c == 10 || c == 13){  break;  }       }
938                 
939                 int score;
940                 int end = seqLength;
941                 
942                 for(int i=0;i<seqLength;i++){
943                         qFile >> score;
944                         
945                         if(score <= qThreshold){
946                                 end = i;
947                                 break;
948                         }
949                 }
950                 for(int i=end+1;i<seqLength;i++){
951                         qFile >> score;
952                 }
953                 
954                 seq.setUnaligned(rawSequence.substr(0,end));
955                 
956                 return 1;
957         }
958         catch(exception& e) {
959                 m->errorOut(e, "TrimSeqsCommand", "stripQualThreshold");
960                 exit(1);
961         }
962 }
963
964 //***************************************************************************************************************
965
966 bool TrimSeqsCommand::cullQualAverage(Sequence& seq, ifstream& qFile){
967         try {
968                 string rawSequence = seq.getUnaligned();
969                 int seqLength = seq.getNumBases();
970                 bool success = 0;       //guilty until proven innocent
971                 string name;
972                 
973                 qFile >> name;
974                 if (name[0] == '>') {  if(name.substr(1) != seq.getName())      {       m->mothurOut("sequence name mismatch btwn fasta: " + seq.getName() + " and qual file: " + name); m->mothurOutEndLine(); }  }
975                 
976                 while (!qFile.eof())    {       char c = qFile.get(); if (c == 10 || c == 13){  break;  }       }
977                 
978                 float score;    
979                 float average = 0;
980                 
981                 for(int i=0;i<seqLength;i++){
982                         qFile >> score;
983                         average += score;
984                 }
985                 average /= seqLength;
986
987                 if(average >= qAverage) {       success = 1;    }
988                 else                                    {       success = 0;    }
989                 
990                 return success;
991         }
992         catch(exception& e) {
993                 m->errorOut(e, "TrimSeqsCommand", "cullQualAverage");
994                 exit(1);
995         }
996 }
997
998 //***************************************************************************************************************