]> git.donarmstrong.com Git - mothur.git/blob - trimseqscommand.cpp
working on trim.seqs
[mothur.git] / trimseqscommand.cpp
1 /*
2  *  trimseqscommand.cpp
3  *  Mothur
4  *
5  *  Created by Pat Schloss on 6/6/09.
6  *  Copyright 2009 Patrick D. Schloss. All rights reserved.
7  *
8  */
9
10 #include "trimseqscommand.h"
11 #include "needlemanoverlap.hpp"
12
13 //***************************************************************************************************************
14
15 TrimSeqsCommand::TrimSeqsCommand(string option)  {
16         try {
17                 
18                 abort = false;
19                 
20                 //allow user to run help
21                 if(option == "help") { help(); abort = true; }
22                 
23                 else {
24                         //valid paramters for this command
25                         string AlignArray[] =  {"fasta", "flip", "oligos", "maxambig", "maxhomop", "minlength", "maxlength", "qfile", 
26                                                                         "qthreshold", "qaverage", "allfiles", "qtrim","diffs", "processors", "outputdir","inputdir"};
27                         
28                         vector<string> myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string)));
29                         
30                         OptionParser parser(option);
31                         map<string,string> parameters = parser.getParameters();
32                         
33                         ValidParameters validParameter;
34                         map<string,string>::iterator it;
35                         
36                         //check to make sure all parameters are valid for command
37                         for (it = parameters.begin(); it != parameters.end(); it++) { 
38                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
39                         }
40                         
41                         //if the user changes the input directory command factory will send this info to us in the output parameter 
42                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
43                         if (inputDir == "not found"){   inputDir = "";          }
44                         else {
45                                 string path;
46                                 it = parameters.find("fasta");
47                                 //user has given a template file
48                                 if(it != parameters.end()){ 
49                                         path = hasPath(it->second);
50                                         //if the user has not given a path then, add inputdir. else leave path alone.
51                                         if (path == "") {       parameters["fasta"] = inputDir + it->second;            }
52                                 }
53                                 
54                                 it = parameters.find("oligos");
55                                 //user has given a template file
56                                 if(it != parameters.end()){ 
57                                         path = hasPath(it->second);
58                                         //if the user has not given a path then, add inputdir. else leave path alone.
59                                         if (path == "") {       parameters["oligos"] = inputDir + it->second;           }
60                                 }
61                                 
62                                 it = parameters.find("qfile");
63                                 //user has given a template file
64                                 if(it != parameters.end()){ 
65                                         path = hasPath(it->second);
66                                         //if the user has not given a path then, add inputdir. else leave path alone.
67                                         if (path == "") {       parameters["qfile"] = inputDir + it->second;            }
68                                 }
69                         }
70
71                         
72                         //check for required parameters
73                         fastaFile = validParameter.validFile(parameters, "fasta", true);
74                         if (fastaFile == "not found") { m->mothurOut("fasta is a required parameter for the screen.seqs command."); m->mothurOutEndLine(); abort = true; }
75                         else if (fastaFile == "not open") { abort = true; }     
76                         
77                         //if the user changes the output directory command factory will send this info to us in the output parameter 
78                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  
79                                 outputDir = ""; 
80                                 outputDir += hasPath(fastaFile); //if user entered a file with a path then preserve it  
81                         }
82                 
83                         //check for optional parameter and set defaults
84                         // ...at some point should added some additional type checking...
85                         string temp;
86                         temp = validParameter.validFile(parameters, "flip", false);
87                         if (temp == "not found"){       flip = 0;       }
88                         else if(isTrue(temp))   {       flip = 1;       }
89                 
90                         temp = validParameter.validFile(parameters, "oligos", true);
91                         if (temp == "not found"){       oligoFile = "";         }
92                         else if(temp == "not open"){    abort = true;   } 
93                         else                                    {       oligoFile = temp;               }
94                         
95                         temp = validParameter.validFile(parameters, "maxambig", false);         if (temp == "not found") { temp = "-1"; }
96                         convert(temp, maxAmbig);  
97
98                         temp = validParameter.validFile(parameters, "maxhomop", false);         if (temp == "not found") { temp = "0"; }
99                         convert(temp, maxHomoP);  
100
101                         temp = validParameter.validFile(parameters, "minlength", false);        if (temp == "not found") { temp = "0"; }
102                         convert(temp, minLength); 
103                         
104                         temp = validParameter.validFile(parameters, "maxlength", false);        if (temp == "not found") { temp = "0"; }
105                         convert(temp, maxLength);
106                         
107                         temp = validParameter.validFile(parameters, "diffs", false);            if (temp == "not found") { temp = "0"; }
108                         convert(temp, diffs);
109                         
110                         temp = validParameter.validFile(parameters, "qfile", true);     
111                         if (temp == "not found")        {       qFileName = "";         }
112                         else if(temp == "not open")     {       abort = true;           }
113                         else                                            {       qFileName = temp;       }
114                         
115                         temp = validParameter.validFile(parameters, "qthreshold", false);       if (temp == "not found") { temp = "0"; }
116                         convert(temp, qThreshold);
117                         
118                         temp = validParameter.validFile(parameters, "qtrim", false);    if (temp == "not found") { temp = "F"; }
119                         qtrim = isTrue(temp);
120
121                         temp = validParameter.validFile(parameters, "qaverage", false);         if (temp == "not found") { temp = "0"; }
122                         convert(temp, qAverage);
123                         
124                         temp = validParameter.validFile(parameters, "allfiles", false);         if (temp == "not found") { temp = "F"; }
125                         allFiles = isTrue(temp);
126                         
127                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found"){       temp = "1";                             }
128                         convert(temp, processors); 
129                         
130                         if(allFiles && oligoFile == ""){
131                                 m->mothurOut("You selected allfiles, but didn't enter an oligos file.  Ignoring the allfiles request."); m->mothurOutEndLine();
132                         }
133                         if((qAverage != 0 && qThreshold != 0) && qFileName == ""){
134                                 m->mothurOut("You didn't provide a quality file name, quality criteria will be ignored."); m->mothurOutEndLine();
135                                 qAverage=0;
136                                 qThreshold=0;
137                         }
138                         if(!flip && oligoFile=="" && !maxLength && !minLength && (maxAmbig==-1) && !maxHomoP && qFileName == ""){               
139                                 m->mothurOut("You didn't set any options... quiting command."); m->mothurOutEndLine();
140                                 abort = true;
141                         }
142                 }
143
144         }
145         catch(exception& e) {
146                 m->errorOut(e, "TrimSeqsCommand", "TrimSeqsCommand");
147                 exit(1);
148         }
149 }
150 //**********************************************************************************************************************
151
152 void TrimSeqsCommand::help(){
153         try {
154                 m->mothurOut("The trim.seqs command reads a fastaFile and creates .....\n");
155                 m->mothurOut("The trim.seqs command parameters are fasta, flip, oligos, maxambig, maxhomop, minlength, maxlength, qfile, qthreshold, qaverage, diffs, qtrim and allfiles.\n");
156                 m->mothurOut("The fasta parameter is required.\n");
157                 m->mothurOut("The flip parameter .... The default is 0.\n");
158                 m->mothurOut("The oligos parameter .... The default is "".\n");
159                 m->mothurOut("The maxambig parameter .... The default is -1.\n");
160                 m->mothurOut("The maxhomop parameter .... The default is 0.\n");
161                 m->mothurOut("The minlength parameter .... The default is 0.\n");
162                 m->mothurOut("The maxlength parameter .... The default is 0.\n");
163                 m->mothurOut("The diffs parameter .... The default is 0.\n");
164                 m->mothurOut("The qfile parameter .....\n");
165                 m->mothurOut("The qthreshold parameter .... The default is 0.\n");
166                 m->mothurOut("The qaverage parameter .... The default is 0.\n");
167                 m->mothurOut("The allfiles parameter .... The default is F.\n");
168                 m->mothurOut("The qtrim parameter .... The default is F.\n");
169                 m->mothurOut("The trim.seqs command should be in the following format: \n");
170                 m->mothurOut("trim.seqs(fasta=yourFastaFile, flip=yourFlip, oligos=yourOligos, maxambig=yourMaxambig,  \n");
171                 m->mothurOut("maxhomop=yourMaxhomop, minlength=youMinlength, maxlength=yourMaxlength)  \n");    
172                 m->mothurOut("Example trim.seqs(fasta=abrecovery.fasta, flip=..., oligos=..., maxambig=..., maxhomop=..., minlength=..., maxlength=...).\n");
173                 m->mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n");
174                 m->mothurOut("For more details please check out the wiki http://www.mothur.org/wiki/Trim.seqs .\n\n");
175
176         }
177         catch(exception& e) {
178                 m->errorOut(e, "TrimSeqsCommand", "help");
179                 exit(1);
180         }
181 }
182
183
184 //***************************************************************************************************************
185
186 TrimSeqsCommand::~TrimSeqsCommand(){    /*      do nothing      */      }
187
188 //***************************************************************************************************************
189
190 int TrimSeqsCommand::execute(){
191         try{
192         
193                 if (abort == true) { return 0; }
194                 
195                 numFPrimers = 0;  //this needs to be initialized
196                 numRPrimers = 0;
197                 
198                 string trimSeqFile = outputDir + getRootName(getSimpleName(fastaFile)) + "trim.fasta";
199                 outputNames.push_back(trimSeqFile);
200                 string scrapSeqFile = outputDir + getRootName(getSimpleName(fastaFile)) + "scrap.fasta";
201                 outputNames.push_back(scrapSeqFile);
202                 string groupFile = outputDir + getRootName(getSimpleName(fastaFile)) + "groups";
203                 
204                 vector<string> fastaFileNames;
205                 if(oligoFile != ""){
206                         outputNames.push_back(groupFile);
207                         getOligos(fastaFileNames);
208                 }
209                 
210                 if(qFileName != "")     {       setLines(qFileName, qLines);    }
211
212                 
213                 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
214                                 if(processors == 1){
215                                         ifstream inFASTA;
216                                         openInputFile(fastaFile, inFASTA);
217                                         int numSeqs=count(istreambuf_iterator<char>(inFASTA),istreambuf_iterator<char>(), '>');
218                                         inFASTA.close();
219                                         
220                                         lines.push_back(new linePair(0, numSeqs));
221                                         
222                                         driverCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, groupFile, fastaFileNames, lines[0], lines[0]);
223                                         
224                                         for (int j = 0; j < fastaFileNames.size(); j++) {
225                                                 rename((fastaFileNames[j] + toString(getpid()) + ".temp").c_str(), fastaFileNames[j].c_str());
226                                         }
227
228                                 }else{
229                                         setLines(fastaFile, lines);     
230                                         if(qFileName == "")     {       qLines = lines; }       
231                                                                 
232                                         createProcessesCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, groupFile, fastaFileNames); 
233                                         
234                                         rename((trimSeqFile + toString(processIDS[0]) + ".temp").c_str(), trimSeqFile.c_str());
235                                         rename((scrapSeqFile + toString(processIDS[0]) + ".temp").c_str(), scrapSeqFile.c_str());
236                                         rename((groupFile + toString(processIDS[0]) + ".temp").c_str(), groupFile.c_str());
237                                         for (int j = 0; j < fastaFileNames.size(); j++) {
238                                                 rename((fastaFileNames[j] + toString(processIDS[0]) + ".temp").c_str(), fastaFileNames[j].c_str());
239                                         }
240                                         //append files
241                                         for(int i=1;i<processors;i++){
242                                                 appendFiles((trimSeqFile + toString(processIDS[i]) + ".temp"), trimSeqFile);
243                                                 remove((trimSeqFile + toString(processIDS[i]) + ".temp").c_str());
244                                                 appendFiles((scrapSeqFile + toString(processIDS[i]) + ".temp"), scrapSeqFile);
245                                                 remove((scrapSeqFile + toString(processIDS[i]) + ".temp").c_str());
246                                                 appendFiles((groupFile + toString(processIDS[i]) + ".temp"), groupFile);
247                                                 remove((groupFile + toString(processIDS[i]) + ".temp").c_str());
248                                                 for (int j = 0; j < fastaFileNames.size(); j++) {
249                                                         appendFiles((fastaFileNames[j] + toString(processIDS[i]) + ".temp"), fastaFileNames[j]);
250                                                         remove((fastaFileNames[j] + toString(processIDS[i]) + ".temp").c_str());
251                                                 }
252                                         }
253                                 }
254                                 
255                                 if (m->control_pressed) {  return 0; }
256                 #else
257                                 ifstream inFASTA;
258                                 openInputFile(fastafileNames[s], inFASTA);
259                                 numSeqs=count(istreambuf_iterator<char>(inFASTA),istreambuf_iterator<char>(), '>');
260                                 inFASTA.close();
261                                 
262                                 lines.push_back(new linePair(0, numSeqs));
263                                 
264                                 driverCreateSummary(fastafile, qFileName, trimSeqFile, scrapSeqFile, groupFile, fastaFileNames, lines[0], lines[0]);
265                                 
266                                 if (m->control_pressed) {  return 0; }
267                 #endif
268                                                 
269                                                                                 
270                 for(int i=0;i<fastaFileNames.size();i++){
271                         ifstream inFASTA;
272                         string seqName;
273                         openInputFile(getRootName(fastaFile) + groupVector[i] + ".fasta", inFASTA);
274                         ofstream outGroups;
275                         openOutputFile(outputDir + getRootName(getSimpleName(fastaFile)) + groupVector[i] + ".groups", outGroups);
276                         outputNames.push_back(outputDir + getRootName(getSimpleName(fastaFile)) + groupVector[i] + ".groups");
277                         
278                         while(!inFASTA.eof()){
279                                 if(inFASTA.get() == '>'){
280                                         inFASTA >> seqName;
281                                         outGroups << seqName << '\t' << groupVector[i] << endl;
282                                 }
283                                 while (!inFASTA.eof())  {       char c = inFASTA.get(); if (c == 10 || c == 13){        break;  }       }
284                         }
285                         outGroups.close();
286                         inFASTA.close();
287                 }
288                 
289                 if (m->control_pressed) { 
290                         for (int i = 0; i < outputNames.size(); i++) {  remove(outputNames[i].c_str()); }
291                         return 0;
292                 }
293
294                 m->mothurOutEndLine();
295                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
296                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
297                 m->mothurOutEndLine();
298                 
299                 return 0;       
300                         
301         }
302         catch(exception& e) {
303                 m->errorOut(e, "TrimSeqsCommand", "execute");
304                 exit(1);
305         }
306 }
307                 
308 /**************************************************************************************/
309 int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string trimFile, string scrapFile, string groupFile, vector<string> fastaNames, linePair* line, linePair* qline) {     
310         try {
311                 
312                 ofstream outFASTA;
313                 int able = openOutputFile(trimFile, outFASTA);
314                 
315                 ofstream scrapFASTA;
316                 openOutputFile(scrapFile, scrapFASTA);
317                 
318                 ofstream outGroups;
319                 vector<ofstream*> fastaFileNames;
320                 if (oligoFile != "") {          
321                         openOutputFile(groupFile, outGroups);   
322                         for (int i = 0; i < fastaNames.size(); i++) {
323                                 fastaFileNames.push_back(new ofstream((fastaNames[i] + toString(getpid()) + ".temp").c_str(), ios::ate)); 
324                         }
325                 }
326                 
327                 ifstream inFASTA;
328                 openInputFile(filename, inFASTA);
329                 
330                 ifstream qFile;
331                 if(qFileName != "")     {       openInputFile(qFileName, qFile);        }
332                 
333                 qFile.seekg(qline->start);
334                 inFASTA.seekg(line->start);
335                 
336                 for(int i=0;i<line->num;i++){
337                                 
338                         if (m->control_pressed) { 
339                                 inFASTA.close(); 
340                                 outFASTA.close();
341                                 scrapFASTA.close();
342                                 if (oligoFile != "") {   outGroups.close();   }
343                                 if(qFileName != "")     {       qFile.close();  }
344                                 for(int i=0;i<fastaFileNames.size();i++){
345                                         fastaFileNames[i]->close();
346                                         delete fastaFileNames[i];
347                                 }       
348                                 for (int i = 0; i < outputNames.size(); i++) {  remove(outputNames[i].c_str()); }
349                                 return 0;
350                         }
351                         
352                         bool success = 1;
353                         
354                         Sequence currSeq(inFASTA);
355
356                         string origSeq = currSeq.getUnaligned();
357                         if (origSeq != "") {
358                                 int group;
359                                 string trashCode = "";
360                                 
361                                 if(qFileName != ""){
362                                         if(qThreshold != 0)             {       success = stripQualThreshold(currSeq, qFile);   }
363                                         else if(qAverage != 0)  {       success = cullQualAverage(currSeq, qFile);              }
364                                         if ((!qtrim) && (origSeq.length() != currSeq.getUnaligned().length())) { 
365                                                 success = 0; //if you don't want to trim and the sequence does not meet quality requirements, move to scrap
366                                         }
367                                         if(!success)                    {       trashCode += 'q';                                                               }
368                                 }
369                         
370                                 if(barcodes.size() != 0){
371                                         success = stripBarcode(currSeq, group);
372                                         if(!success){   trashCode += 'b';       }
373                                 }
374                         
375                                 if(numFPrimers != 0){
376                                         success = stripForward(currSeq);
377                                         if(!success){   trashCode += 'f';       }
378                                 }
379                                         
380                                 if(numRPrimers != 0){
381                                         success = stripReverse(currSeq);
382                                         if(!success){   trashCode += 'r';       }
383                                 }
384                 
385                                 if(minLength > 0 || maxLength > 0){
386                                         success = cullLength(currSeq);
387                                         if(!success){   trashCode += 'l'; }
388                                 }
389                                 if(maxHomoP > 0){
390                                         success = cullHomoP(currSeq);
391                                         if(!success){   trashCode += 'h';       }
392                                 }
393                                 if(maxAmbig != -1){
394                                         success = cullAmbigs(currSeq);
395                                         if(!success){   trashCode += 'n';       }
396                                 }
397                                 
398                                 if(flip){       currSeq.reverseComplement();    }               // should go last                       
399                                 
400                                 if(trashCode.length() == 0){
401                                         currSeq.setAligned(currSeq.getUnaligned());  //this is because of a modification we made to the sequence class to fix a bug.  all seqs have an aligned version, which is the version that gets printed.
402                                         currSeq.printSequence(outFASTA);
403                                         if(barcodes.size() != 0){
404                                                 outGroups << currSeq.getName() << '\t' << groupVector[group] << endl;
405                                                 
406                                                 if(allFiles){
407                                                         currSeq.printSequence(*fastaFileNames[group]);                                  
408                                                 }
409                                         }
410                                 }
411                                 else{
412                                         currSeq.setName(currSeq.getName() + '|' + trashCode);
413                                         currSeq.setUnaligned(origSeq);
414                                         currSeq.printSequence(scrapFASTA);
415                                 }
416                         }
417                         gobble(inFASTA);
418                 }
419                 
420                 inFASTA.close();
421                 outFASTA.close();
422                 scrapFASTA.close();
423                 if (oligoFile != "") {   outGroups.close();   }
424                 if(qFileName != "")     {       qFile.close();  }
425                 
426                 for(int i=0;i<fastaFileNames.size();i++){
427                         fastaFileNames[i]->close();
428                         delete fastaFileNames[i];
429                 }               
430                 
431                 return 0;
432         }
433         catch(exception& e) {
434                 m->errorOut(e, "TrimSeqsCommand", "driverCreateTrim");
435                 exit(1);
436         }
437 }
438 /**************************************************************************************************/
439 int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName, string trimFile, string scrapFile, string groupFile, vector<string> fastaNames) {
440         try {
441 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
442                 int process = 0;
443                 int exitCommand = 1;
444                 processIDS.clear();
445                 
446                 //loop through and create all the processes you want
447                 while (process != processors) {
448                         int pid = fork();
449                         
450                         if (pid > 0) {
451                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
452                                 process++;
453                         }else if (pid == 0){
454                                 driverCreateTrim(filename, qFileName, (trimFile + toString(getpid()) + ".temp"), (scrapFile + toString(getpid()) + ".temp"), (groupFile + toString(getpid()) + ".temp"), fastaNames, lines[process], qLines[process]);
455                                 exit(0);
456                         }else { m->mothurOut("unable to spawn the necessary processes."); m->mothurOutEndLine(); exit(0); }
457                 }
458                 
459                 //force parent to wait until all the processes are done
460                 for (int i=0;i<processors;i++) { 
461                         int temp = processIDS[i];
462                         wait(&temp);
463                 }
464                 
465                 return exitCommand;
466 #endif          
467         }
468         catch(exception& e) {
469                 m->errorOut(e, "TrimSeqsCommand", "createProcessesCreateTrim");
470                 exit(1);
471         }
472 }
473 /**************************************************************************************************/
474
475 int TrimSeqsCommand::setLines(string filename, vector<linePair*>& lines) {
476         try {
477                 
478                 lines.clear();
479                 
480                 vector<long int> positions;
481                 
482                 ifstream inFASTA;
483                 openInputFile(filename, inFASTA);
484                         
485                 string input;
486                 while(!inFASTA.eof()){  
487                         input = getline(inFASTA);
488
489                         if (input.length() != 0) {
490                                 if(input[0] == '>'){ long int pos = inFASTA.tellg(); positions.push_back(pos - input.length() - 1);     }
491                         }
492                 }
493                 inFASTA.close();
494                 
495                 int numFastaSeqs = positions.size();
496         
497                 FILE * pFile;
498                 long size;
499                 
500                 //get num bytes in file
501                 pFile = fopen (filename.c_str(),"rb");
502                 if (pFile==NULL) perror ("Error opening file");
503                 else{
504                         fseek (pFile, 0, SEEK_END);
505                         size=ftell (pFile);
506                         fclose (pFile);
507                 }
508                 
509                 int numSeqsPerProcessor = numFastaSeqs / processors;
510                 
511                 for (int i = 0; i < processors; i++) {
512
513                         long int startPos = positions[ i * numSeqsPerProcessor ];
514                         if(i == processors - 1){
515                                 numSeqsPerProcessor = numFastaSeqs - i * numSeqsPerProcessor;
516                         }else{  
517                                 long int myEnd = positions[ (i+1) * numSeqsPerProcessor ];
518                         }
519                         lines.push_back(new linePair(startPos, numSeqsPerProcessor));
520                 }
521                 
522                 return numFastaSeqs;
523         }
524         catch(exception& e) {
525                 m->errorOut(e, "TrimSeqsCommand", "setLines");
526                 exit(1);
527         }
528 }
529 //***************************************************************************************************************
530
531 void TrimSeqsCommand::getOligos(vector<string>& outFASTAVec){ //vector<ofstream*>& outFASTAVec
532         try {
533                 ifstream inOligos;
534                 openInputFile(oligoFile, inOligos);
535                 
536                 ofstream test;
537                 
538                 string type, oligo, group;
539                 int index=0;
540                 
541                 while(!inOligos.eof()){
542                         inOligos >> type;
543                         
544                         if(type[0] == '#'){
545                                 while (!inOligos.eof()) {       char c = inOligos.get(); if (c == 10 || c == 13){       break;  }       } // get rest of line if there's any crap there
546                         }
547                         else{
548                                 inOligos >> oligo;
549                                 
550                                 for(int i=0;i<oligo.length();i++){
551                                         oligo[i] = toupper(oligo[i]);
552                                         if(oligo[i] == 'U')     {       oligo[i] = 'T'; }
553                                 }
554                                 
555                                 if(type == "forward"){
556                                         forPrimer.push_back(oligo);
557                                 }
558                                 else if(type == "reverse"){
559                                         Sequence oligoRC("reverse", oligo);
560                                         oligoRC.reverseComplement();
561                                         revPrimer.push_back(oligoRC.getUnaligned());
562                                 }
563                                 else if(type == "barcode"){
564                                         inOligos >> group;
565                                         barcodes[oligo]=index++;
566                                         groupVector.push_back(group);
567                                         
568                                         if(allFiles){
569                                                 //outFASTAVec.push_back(new ofstream((outputDir + getRootName(getSimpleName(fastaFile)) + group + ".fasta").c_str(), ios::ate));
570                                                 outputNames.push_back((outputDir + getRootName(getSimpleName(fastaFile)) + group + ".fasta"));
571                                                 outFASTAVec.push_back((outputDir + getRootName(getSimpleName(fastaFile)) + group + ".fasta"));
572                                         }
573                                 }
574                         }
575                 }
576                 
577                 inOligos.close();
578                 
579                 numFPrimers = forPrimer.size();
580                 numRPrimers = revPrimer.size();
581                 
582         }
583         catch(exception& e) {
584                 m->errorOut(e, "TrimSeqsCommand", "getOligos");
585                 exit(1);
586         }
587 }
588 //***************************************************************************************************************
589
590 bool TrimSeqsCommand::stripBarcode(Sequence& seq, int& group){
591         try {
592                 string rawSequence = seq.getUnaligned();
593                 bool success = 0;       //guilty until proven innocent
594                 
595                 //can you find the barcode
596                 for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
597                         string oligo = it->first;
598                         if(rawSequence.length() < oligo.length()){      //let's just assume that the barcodes are the same length
599                                 success = 0;
600                                 break;
601                         }
602                         
603                         if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
604                                 group = it->second;
605                                 seq.setUnaligned(rawSequence.substr(oligo.length()));
606                                 success = 1;
607                                 break;
608                         }
609                 }
610                 
611                 //if you found the barcode or if you don't want to allow for diffs
612                 if ((diffs == 0) || (success == 1)) { return success;  }
613                 
614                 else { //try aligning and see if you can find it
615                         
616                         Alignment* alignment;
617                         if (barcodes.size() > 0) { //assumes barcodes are all the same length
618                                 map<string,int>::iterator it=barcodes.begin();
619                                 string temp = it->first;
620                                 
621                                 alignment = new NeedlemanOverlap(-2.0, 1.0, -1.0, (temp.length()+diffs+1));  
622                         }else{ alignment = NULL; } 
623                         
624
625                         //can you find the barcode
626                         for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
627                                 string oligo = it->first;
628                                 int length = oligo.length();
629                                 
630                                 if(rawSequence.length() < oligo.length()){      //let's just assume that the barcodes are the same length
631                                         success = 0;
632                                         break;
633                                 }
634                                 
635                                 //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
636                                 alignment->align(oligo, rawSequence.substr(0,length+diffs));
637                                 oligo = alignment->getSeqAAln();
638                                 string temp = alignment->getSeqBAln();
639                 cout << "barcode = " << oligo << " raw = " << rawSequence.substr(0,oligo.length()) << " raw aligned = " << temp << endl;                        
640                                 
641                                 int newStart=0;
642                                 if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()), length, newStart)){
643                 cout << "found match" << endl;
644                                         group = it->second;
645                                         seq.setUnaligned(rawSequence.substr(newStart));
646                                         success = 1;
647                                         break;
648                                 }
649                         }
650                         
651                         if (alignment != NULL) {  delete alignment;  }
652                 }
653                 return success;
654                 
655         }
656         catch(exception& e) {
657                 m->errorOut(e, "TrimSeqsCommand", "stripBarcode");
658                 exit(1);
659         }
660
661 }
662
663 //***************************************************************************************************************
664
665 bool TrimSeqsCommand::stripForward(Sequence& seq){
666         try {
667                 string rawSequence = seq.getUnaligned();
668                 bool success = 0;       //guilty until proven innocent
669                 
670                 for(int i=0;i<numFPrimers;i++){
671                         string oligo = forPrimer[i];
672                         
673                         if(rawSequence.length() < oligo.length()){
674                                 success = 0;
675                                 break;
676                         }
677                         
678                         if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
679                                 seq.setUnaligned(rawSequence.substr(oligo.length()));
680                                 success = 1;
681                                 break;
682                         }
683                 }
684                 
685                 //if you found the primer or if you don't want to allow for diffs
686                 if ((diffs == 0) || (success == 1)) { return success;  }
687                 
688                 else { //try aligning and see if you can find it
689                         
690                         
691                         Alignment* alignment;
692                         if (numFPrimers > 0) { alignment = new NeedlemanOverlap(-2.0, 1.0, -1.0, (forPrimer[0].length()+diffs+1));  } //assumes primers are all the same length
693                         else{ alignment = NULL; } 
694                         
695                         //can you find the primer
696                         for(int i=0;i<numFPrimers;i++){
697                                 string oligo = forPrimer[i];
698                                 int length = oligo.length();
699                                 
700                                 if(rawSequence.length() < oligo.length()){      
701                                         success = 0;
702                                         break;
703                                 }
704                                 
705                                 //use needleman to align first primer.length()+numdiffs of sequence to each primer
706                                 alignment->align(oligo, rawSequence.substr(0,length+diffs));
707                                 oligo = alignment->getSeqAAln();
708                                 
709                                 int newStart = 0;
710                                 if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()), length, newStart)){
711                                         seq.setUnaligned(rawSequence.substr(newStart));
712                                         success = 1;
713                                         break;
714                                 }
715                         }
716                         
717                         if (alignment != NULL) {  delete alignment;  }
718                 }
719                 
720                 return success;
721                 
722         }
723         catch(exception& e) {
724                 m->errorOut(e, "TrimSeqsCommand", "stripForward");
725                 exit(1);
726         }
727 }
728
729 //***************************************************************************************************************
730
731 bool TrimSeqsCommand::stripReverse(Sequence& seq){
732         try {
733                 string rawSequence = seq.getUnaligned();
734                 bool success = 0;       //guilty until proven innocent
735                 
736                 for(int i=0;i<numRPrimers;i++){
737                         string oligo = revPrimer[i];
738                         
739                         if(rawSequence.length() < oligo.length()){
740                                 success = 0;
741                                 break;
742                         }
743                         
744                         if(compareDNASeq(oligo, rawSequence.substr(rawSequence.length()-oligo.length(),oligo.length()))){
745                                 seq.setUnaligned(rawSequence.substr(0,rawSequence.length()-oligo.length()));
746                                 success = 1;
747                                 break;
748                         }
749                 }       
750                 return success;
751                 
752         }
753         catch(exception& e) {
754                 m->errorOut(e, "TrimSeqsCommand", "stripReverse");
755                 exit(1);
756         }
757 }
758
759 //***************************************************************************************************************
760
761 bool TrimSeqsCommand::cullLength(Sequence& seq){
762         try {
763         
764                 int length = seq.getNumBases();
765                 bool success = 0;       //guilty until proven innocent
766                 
767                 if(length >= minLength && maxLength == 0)                       {       success = 1;    }
768                 else if(length >= minLength && length <= maxLength)     {       success = 1;    }
769                 else                                                                                            {       success = 0;    }
770                 
771                 return success;
772         
773         }
774         catch(exception& e) {
775                 m->errorOut(e, "TrimSeqsCommand", "cullLength");
776                 exit(1);
777         }
778         
779 }
780
781 //***************************************************************************************************************
782
783 bool TrimSeqsCommand::cullHomoP(Sequence& seq){
784         try {
785                 int longHomoP = seq.getLongHomoPolymer();
786                 bool success = 0;       //guilty until proven innocent
787                 
788                 if(longHomoP <= maxHomoP){      success = 1;    }
789                 else                                    {       success = 0;    }
790                 
791                 return success;
792         }
793         catch(exception& e) {
794                 m->errorOut(e, "TrimSeqsCommand", "cullHomoP");
795                 exit(1);
796         }
797         
798 }
799
800 //***************************************************************************************************************
801
802 bool TrimSeqsCommand::cullAmbigs(Sequence& seq){
803         try {
804                 int numNs = seq.getAmbigBases();
805                 bool success = 0;       //guilty until proven innocent
806                 
807                 if(numNs <= maxAmbig)   {       success = 1;    }
808                 else                                    {       success = 0;    }
809                 
810                 return success;
811         }
812         catch(exception& e) {
813                 m->errorOut(e, "TrimSeqsCommand", "cullAmbigs");
814                 exit(1);
815         }
816         
817 }
818
819 //***************************************************************************************************************
820
821 bool TrimSeqsCommand::compareDNASeq(string oligo, string seq){
822         try {
823                 bool success = 1;
824                 int length = oligo.length();
825                 
826                 for(int i=0;i<length;i++){
827                         
828                         if(oligo[i] != seq[i]){
829                                 if(oligo[i] == 'A' || oligo[i] == 'T' || oligo[i] == 'G' || oligo[i] == 'C')    {       success = 0;    }
830                                 else if((oligo[i] == 'N' || oligo[i] == 'I') && (seq[i] == 'N'))                                {       success = 0;    }
831                                 else if(oligo[i] == 'R' && (seq[i] != 'A' && seq[i] != 'G'))                                    {       success = 0;    }
832                                 else if(oligo[i] == 'Y' && (seq[i] != 'C' && seq[i] != 'T'))                                    {       success = 0;    }
833                                 else if(oligo[i] == 'M' && (seq[i] != 'C' && seq[i] != 'A'))                                    {       success = 0;    }
834                                 else if(oligo[i] == 'K' && (seq[i] != 'T' && seq[i] != 'G'))                                    {       success = 0;    }
835                                 else if(oligo[i] == 'W' && (seq[i] != 'T' && seq[i] != 'A'))                                    {       success = 0;    }
836                                 else if(oligo[i] == 'S' && (seq[i] != 'C' && seq[i] != 'G'))                                    {       success = 0;    }
837                                 else if(oligo[i] == 'B' && (seq[i] != 'C' && seq[i] != 'T' && seq[i] != 'G'))   {       success = 0;    }
838                                 else if(oligo[i] == 'D' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'G'))   {       success = 0;    }
839                                 else if(oligo[i] == 'H' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'C'))   {       success = 0;    }
840                                 else if(oligo[i] == 'V' && (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G'))   {       success = 0;    }                       
841                                 
842                                 if(success == 0)        {       break;   }
843                         }
844                         else{
845                                 success = 1;
846                         }
847                 }
848                 
849                 return success;
850         }
851         catch(exception& e) {
852                 m->errorOut(e, "TrimSeqsCommand", "compareDNASeq");
853                 exit(1);
854         }
855
856 }
857 //***************************************************************************************************************
858
859 bool TrimSeqsCommand::compareDNASeq(string oligo, string seq, int numBases, int& end){
860         try {
861                 bool success = 1;
862                 int length = oligo.length();
863                 end = length;
864                 int countBases = 0;
865                 int countDiffs = 0;
866                 
867                 if (length != 0) {
868                         if ((oligo[0] == '-') || (oligo[0] == '.')) {  success = 0; return success;  } //no gaps allowed at beginning
869                 }
870                 
871                 for(int i=0;i<length;i++){
872                         
873                         if ((oligo[i] != '-') && (oligo[i] != '.'))  { countBases++; } 
874                         
875         //cout << oligo[i] << " " << seq[i] << " diffs = " << countDiffs << " countBases = " << countBases << endl;     
876                 
877                         if(oligo[i] != seq[i]){
878                                 if(oligo[i] == 'A' || oligo[i] == 'T' || oligo[i] == 'G' || oligo[i] == 'C' || oligo[i] == '-' || oligo[i] == '.')      {       countDiffs++;   }
879                                 else if((oligo[i] == 'N' || oligo[i] == 'I') && (seq[i] == 'N'))                                {       countDiffs++;   }
880                                 else if(oligo[i] == 'R' && (seq[i] != 'A' && seq[i] != 'G'))                                    {       countDiffs++;   }
881                                 else if(oligo[i] == 'Y' && (seq[i] != 'C' && seq[i] != 'T'))                                    {       countDiffs++;   }
882                                 else if(oligo[i] == 'M' && (seq[i] != 'C' && seq[i] != 'A'))                                    {       countDiffs++;   }
883                                 else if(oligo[i] == 'K' && (seq[i] != 'T' && seq[i] != 'G'))                                    {       countDiffs++;   }
884                                 else if(oligo[i] == 'W' && (seq[i] != 'T' && seq[i] != 'A'))                                    {       countDiffs++;   }
885                                 else if(oligo[i] == 'S' && (seq[i] != 'C' && seq[i] != 'G'))                                    {       countDiffs++;   }
886                                 else if(oligo[i] == 'B' && (seq[i] != 'C' && seq[i] != 'T' && seq[i] != 'G'))   {       countDiffs++;   }
887                                 else if(oligo[i] == 'D' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'G'))   {       countDiffs++;   }
888                                 else if(oligo[i] == 'H' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'C'))   {       countDiffs++;   }
889                                 else if(oligo[i] == 'V' && (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G'))   {       countDiffs++;   }                       
890                                 
891                                 if(countDiffs > diffs)  {       success = 0; break;      }
892                         }
893                         else{
894                                 success = 1;
895                         }
896                         
897                         if (countBases >= numBases) { end = i; break; } //stop checking after end of barcode or primer
898                 }
899         
900                 return success;
901         }
902         catch(exception& e) {
903                 m->errorOut(e, "TrimSeqsCommand", "compareDNASeq");
904                 exit(1);
905         }
906
907 }
908 //***************************************************************************************************************
909
910 bool TrimSeqsCommand::stripQualThreshold(Sequence& seq, ifstream& qFile){
911         try {
912                 string rawSequence = seq.getUnaligned();
913                 int seqLength;  // = rawSequence.length();
914                 string name, temp, temp2;
915                 
916                 qFile >> name >> temp;
917         
918                 splitAtEquals(temp2, temp); //separates length=242, temp=length, temp2=242
919                 convert(temp, seqLength); //converts string to int
920         
921                 if (name.length() != 0) {  if(name.substr(1) != seq.getName())  {       m->mothurOut("sequence name mismatch btwn fasta and qual file"); m->mothurOutEndLine(); }  } 
922                 while (!qFile.eof())    {       char c = qFile.get(); if (c == 10 || c == 13){  break;  }       }
923                 
924                 int score;
925                 int end = seqLength;
926                 
927                 for(int i=0;i<seqLength;i++){
928                         qFile >> score;
929                         
930                         if(score <= qThreshold){
931                                 end = i;
932                                 break;
933                         }
934                 }
935                 for(int i=end+1;i<seqLength;i++){
936                         qFile >> score;
937                 }
938                 
939                 seq.setUnaligned(rawSequence.substr(0,end));
940                 
941                 return 1;
942         }
943         catch(exception& e) {
944                 m->errorOut(e, "TrimSeqsCommand", "stripQualThreshold");
945                 exit(1);
946         }
947 }
948
949 //***************************************************************************************************************
950
951 bool TrimSeqsCommand::cullQualAverage(Sequence& seq, ifstream& qFile){
952         try {
953                 string rawSequence = seq.getUnaligned();
954                 int seqLength = seq.getNumBases();
955                 bool success = 0;       //guilty until proven innocent
956                 string name;
957                 
958                 qFile >> name;
959                 if (name[0] == '>') {  if(name.substr(1) != seq.getName())      {       m->mothurOut("sequence name mismatch btwn fasta: " + seq.getName() + " and qual file: " + name); m->mothurOutEndLine(); }  }
960                 
961                 while (!qFile.eof())    {       char c = qFile.get(); if (c == 10 || c == 13){  break;  }       }
962                 
963                 float score;    
964                 float average = 0;
965                 
966                 for(int i=0;i<seqLength;i++){
967                         qFile >> score;
968                         average += score;
969                 }
970                 average /= seqLength;
971
972                 if(average >= qAverage) {       success = 1;    }
973                 else                                    {       success = 0;    }
974                 
975                 return success;
976         }
977         catch(exception& e) {
978                 m->errorOut(e, "TrimSeqsCommand", "cullQualAverage");
979                 exit(1);
980         }
981 }
982
983 //***************************************************************************************************************