]> git.donarmstrong.com Git - mothur.git/blob - trimseqscommand.cpp
fixed project
[mothur.git] / trimseqscommand.cpp
1 /*
2  *  trimseqscommand.cpp
3  *  Mothur
4  *
5  *  Created by Pat Schloss on 6/6/09.
6  *  Copyright 2009 Patrick D. Schloss. All rights reserved.
7  *
8  */
9
10 #include "trimseqscommand.h"
11 #include "needlemanoverlap.hpp"
12
13 //***************************************************************************************************************
14
15 TrimSeqsCommand::TrimSeqsCommand(string option)  {
16         try {
17                 
18                 abort = false;
19                 
20                 //allow user to run help
21                 if(option == "help") { help(); abort = true; }
22                 
23                 else {
24                         //valid paramters for this command
25                         string AlignArray[] =  {"fasta", "flip", "oligos", "maxambig", "maxhomop", "minlength", "maxlength", "qfile", 
26                                                                         "qthreshold", "qaverage", "allfiles", "qtrim","tdiffs", "pdiffs", "bdiffs", "processors", "outputdir","inputdir"};
27                         
28                         vector<string> myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string)));
29                         
30                         OptionParser parser(option);
31                         map<string,string> parameters = parser.getParameters();
32                         
33                         ValidParameters validParameter;
34                         map<string,string>::iterator it;
35                         
36                         //check to make sure all parameters are valid for command
37                         for (it = parameters.begin(); it != parameters.end(); it++) { 
38                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
39                         }
40                         
41                         //if the user changes the input directory command factory will send this info to us in the output parameter 
42                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
43                         if (inputDir == "not found"){   inputDir = "";          }
44                         else {
45                                 string path;
46                                 it = parameters.find("fasta");
47                                 //user has given a template file
48                                 if(it != parameters.end()){ 
49                                         path = hasPath(it->second);
50                                         //if the user has not given a path then, add inputdir. else leave path alone.
51                                         if (path == "") {       parameters["fasta"] = inputDir + it->second;            }
52                                 }
53                                 
54                                 it = parameters.find("oligos");
55                                 //user has given a template file
56                                 if(it != parameters.end()){ 
57                                         path = hasPath(it->second);
58                                         //if the user has not given a path then, add inputdir. else leave path alone.
59                                         if (path == "") {       parameters["oligos"] = inputDir + it->second;           }
60                                 }
61                                 
62                                 it = parameters.find("qfile");
63                                 //user has given a template file
64                                 if(it != parameters.end()){ 
65                                         path = hasPath(it->second);
66                                         //if the user has not given a path then, add inputdir. else leave path alone.
67                                         if (path == "") {       parameters["qfile"] = inputDir + it->second;            }
68                                 }
69                         }
70
71                         
72                         //check for required parameters
73                         fastaFile = validParameter.validFile(parameters, "fasta", true);
74                         if (fastaFile == "not found") { m->mothurOut("fasta is a required parameter for the screen.seqs command."); m->mothurOutEndLine(); abort = true; }
75                         else if (fastaFile == "not open") { abort = true; }     
76                         
77                         //if the user changes the output directory command factory will send this info to us in the output parameter 
78                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  
79                                 outputDir = ""; 
80                                 outputDir += hasPath(fastaFile); //if user entered a file with a path then preserve it  
81                         }
82                 
83                         //check for optional parameter and set defaults
84                         // ...at some point should added some additional type checking...
85                         string temp;
86                         temp = validParameter.validFile(parameters, "flip", false);
87                         if (temp == "not found"){       flip = 0;       }
88                         else if(isTrue(temp))   {       flip = 1;       }
89                 
90                         temp = validParameter.validFile(parameters, "oligos", true);
91                         if (temp == "not found"){       oligoFile = "";         }
92                         else if(temp == "not open"){    abort = true;   } 
93                         else                                    {       oligoFile = temp;               }
94                         
95                         temp = validParameter.validFile(parameters, "maxambig", false);         if (temp == "not found") { temp = "-1"; }
96                         convert(temp, maxAmbig);  
97
98                         temp = validParameter.validFile(parameters, "maxhomop", false);         if (temp == "not found") { temp = "0"; }
99                         convert(temp, maxHomoP);  
100
101                         temp = validParameter.validFile(parameters, "minlength", false);        if (temp == "not found") { temp = "0"; }
102                         convert(temp, minLength); 
103                         
104                         temp = validParameter.validFile(parameters, "maxlength", false);        if (temp == "not found") { temp = "0"; }
105                         convert(temp, maxLength);
106                         
107                         
108                         temp = validParameter.validFile(parameters, "bdiffs", false);           if (temp == "not found") { temp = "0"; }
109                         convert(temp, bdiffs);
110                         
111                         temp = validParameter.validFile(parameters, "pdiffs", false);           if (temp == "not found") { temp = "0"; }
112                         convert(temp, pdiffs);
113                         
114                         temp = validParameter.validFile(parameters, "tdiffs", false);           if (temp == "not found") { int tempTotal = pdiffs + bdiffs;  temp = toString(tempTotal); }
115                         convert(temp, tdiffs);
116                         
117                         if(tdiffs == 0){        tdiffs = bdiffs + pdiffs;       }
118                         
119                         temp = validParameter.validFile(parameters, "qfile", true);     
120                         if (temp == "not found")        {       qFileName = "";         }
121                         else if(temp == "not open")     {       abort = true;           }
122                         else                                            {       qFileName = temp;       }
123                         
124                         temp = validParameter.validFile(parameters, "qthreshold", false);       if (temp == "not found") { temp = "0"; }
125                         convert(temp, qThreshold);
126                         
127                         temp = validParameter.validFile(parameters, "qtrim", false);    if (temp == "not found") { temp = "F"; }
128                         qtrim = isTrue(temp);
129
130                         temp = validParameter.validFile(parameters, "qaverage", false);         if (temp == "not found") { temp = "0"; }
131                         convert(temp, qAverage);
132                         
133                         temp = validParameter.validFile(parameters, "allfiles", false);         if (temp == "not found") { temp = "F"; }
134                         allFiles = isTrue(temp);
135                         
136                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found"){       temp = "1";                             }
137                         convert(temp, processors); 
138                         
139                         if(allFiles && oligoFile == ""){
140                                 m->mothurOut("You selected allfiles, but didn't enter an oligos file.  Ignoring the allfiles request."); m->mothurOutEndLine();
141                         }
142                         if((qAverage != 0 && qThreshold != 0) && qFileName == ""){
143                                 m->mothurOut("You didn't provide a quality file name, quality criteria will be ignored."); m->mothurOutEndLine();
144                                 qAverage=0;
145                                 qThreshold=0;
146                         }
147                         if(!flip && oligoFile=="" && !maxLength && !minLength && (maxAmbig==-1) && !maxHomoP && qFileName == ""){               
148                                 m->mothurOut("You didn't set any options... quiting command."); m->mothurOutEndLine();
149                                 abort = true;
150                         }
151                 }
152
153         }
154         catch(exception& e) {
155                 m->errorOut(e, "TrimSeqsCommand", "TrimSeqsCommand");
156                 exit(1);
157         }
158 }
159 //**********************************************************************************************************************
160
161 void TrimSeqsCommand::help(){
162         try {
163                 m->mothurOut("The trim.seqs command reads a fastaFile and creates .....\n");
164                 m->mothurOut("The trim.seqs command parameters are fasta, flip, oligos, maxambig, maxhomop, minlength, maxlength, qfile, qthreshold, qaverage, diffs, qtrim and allfiles.\n");
165                 m->mothurOut("The fasta parameter is required.\n");
166                 m->mothurOut("The flip parameter .... The default is 0.\n");
167                 m->mothurOut("The oligos parameter .... The default is "".\n");
168                 m->mothurOut("The maxambig parameter .... The default is -1.\n");
169                 m->mothurOut("The maxhomop parameter .... The default is 0.\n");
170                 m->mothurOut("The minlength parameter .... The default is 0.\n");
171                 m->mothurOut("The maxlength parameter .... The default is 0.\n");
172                 m->mothurOut("The tdiffs parameter is used to specify the total number of differences allowed in the sequence. The default is pdiffs + bdiffs.\n");
173                 m->mothurOut("The bdiffs parameter is used to specify the number of differences allowed in the barcode. The default is 0.\n");
174                 m->mothurOut("The pdiffs parameter is used to specify the number of differences allowed in the primer. The default is 0.\n");
175                 m->mothurOut("The qfile parameter .....\n");
176                 m->mothurOut("The qthreshold parameter .... The default is 0.\n");
177                 m->mothurOut("The qaverage parameter .... The default is 0.\n");
178                 m->mothurOut("The allfiles parameter .... The default is F.\n");
179                 m->mothurOut("The qtrim parameter .... The default is F.\n");
180                 m->mothurOut("The trim.seqs command should be in the following format: \n");
181                 m->mothurOut("trim.seqs(fasta=yourFastaFile, flip=yourFlip, oligos=yourOligos, maxambig=yourMaxambig,  \n");
182                 m->mothurOut("maxhomop=yourMaxhomop, minlength=youMinlength, maxlength=yourMaxlength)  \n");    
183                 m->mothurOut("Example trim.seqs(fasta=abrecovery.fasta, flip=..., oligos=..., maxambig=..., maxhomop=..., minlength=..., maxlength=...).\n");
184                 m->mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n");
185                 m->mothurOut("For more details please check out the wiki http://www.mothur.org/wiki/Trim.seqs .\n\n");
186
187         }
188         catch(exception& e) {
189                 m->errorOut(e, "TrimSeqsCommand", "help");
190                 exit(1);
191         }
192 }
193
194
195 //***************************************************************************************************************
196
197 TrimSeqsCommand::~TrimSeqsCommand(){    /*      do nothing      */      }
198
199 //***************************************************************************************************************
200
201 int TrimSeqsCommand::execute(){
202         try{
203         
204                 if (abort == true) { return 0; }
205                 
206                 numFPrimers = 0;  //this needs to be initialized
207                 numRPrimers = 0;
208                 
209                 string trimSeqFile = outputDir + getRootName(getSimpleName(fastaFile)) + "trim.fasta";
210                 outputNames.push_back(trimSeqFile);
211                 string scrapSeqFile = outputDir + getRootName(getSimpleName(fastaFile)) + "scrap.fasta";
212                 outputNames.push_back(scrapSeqFile);
213                 string groupFile = outputDir + getRootName(getSimpleName(fastaFile)) + "groups";
214                 
215                 vector<string> fastaFileNames;
216                 if(oligoFile != ""){
217                         outputNames.push_back(groupFile);
218                         getOligos(fastaFileNames);
219                 }
220                 
221                 if(qFileName != "")     {       setLines(qFileName, qLines);    }
222
223                 
224                 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
225                                 if(processors == 1){
226                                         ifstream inFASTA;
227                                         openInputFile(fastaFile, inFASTA);
228                                         int numSeqs=count(istreambuf_iterator<char>(inFASTA),istreambuf_iterator<char>(), '>');
229                                         inFASTA.close();
230                                         
231                                         lines.push_back(new linePair(0, numSeqs));
232                                         
233                                         driverCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, groupFile, fastaFileNames, lines[0], lines[0]);
234                                         
235                                         for (int j = 0; j < fastaFileNames.size(); j++) {
236                                                 rename((fastaFileNames[j] + toString(getpid()) + ".temp").c_str(), fastaFileNames[j].c_str());
237                                         }
238
239                                 }else{
240                                         setLines(fastaFile, lines);     
241                                         if(qFileName == "")     {       qLines = lines; }       
242                                                                 
243                                         createProcessesCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, groupFile, fastaFileNames); 
244                                         
245                                         rename((trimSeqFile + toString(processIDS[0]) + ".temp").c_str(), trimSeqFile.c_str());
246                                         rename((scrapSeqFile + toString(processIDS[0]) + ".temp").c_str(), scrapSeqFile.c_str());
247                                         rename((groupFile + toString(processIDS[0]) + ".temp").c_str(), groupFile.c_str());
248                                         for (int j = 0; j < fastaFileNames.size(); j++) {
249                                                 rename((fastaFileNames[j] + toString(processIDS[0]) + ".temp").c_str(), fastaFileNames[j].c_str());
250                                         }
251                                         //append files
252                                         for(int i=1;i<processors;i++){
253                                                 appendFiles((trimSeqFile + toString(processIDS[i]) + ".temp"), trimSeqFile);
254                                                 remove((trimSeqFile + toString(processIDS[i]) + ".temp").c_str());
255                                                 appendFiles((scrapSeqFile + toString(processIDS[i]) + ".temp"), scrapSeqFile);
256                                                 remove((scrapSeqFile + toString(processIDS[i]) + ".temp").c_str());
257                                                 appendFiles((groupFile + toString(processIDS[i]) + ".temp"), groupFile);
258                                                 remove((groupFile + toString(processIDS[i]) + ".temp").c_str());
259                                                 for (int j = 0; j < fastaFileNames.size(); j++) {
260                                                         appendFiles((fastaFileNames[j] + toString(processIDS[i]) + ".temp"), fastaFileNames[j]);
261                                                         remove((fastaFileNames[j] + toString(processIDS[i]) + ".temp").c_str());
262                                                 }
263                                         }
264                                 }
265                                 
266                                 if (m->control_pressed) {  return 0; }
267                 #else
268                                 ifstream inFASTA;
269                                 openInputFile(fastaFile, inFASTA);
270                                 int numSeqs=count(istreambuf_iterator<char>(inFASTA),istreambuf_iterator<char>(), '>');
271                                 inFASTA.close();
272                                 
273                                 lines.push_back(new linePair(0, numSeqs));
274                                 
275                                 driverCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, groupFile, fastaFileNames, lines[0], lines[0]);
276                                 
277                                 if (m->control_pressed) {  return 0; }
278                 #endif
279                                                 
280                                                                                 
281                 for(int i=0;i<fastaFileNames.size();i++){
282                         ifstream inFASTA;
283                         string seqName;
284                         openInputFile(getRootName(fastaFile) + groupVector[i] + ".fasta", inFASTA);
285                         ofstream outGroups;
286                         openOutputFile(outputDir + getRootName(getSimpleName(fastaFile)) + groupVector[i] + ".groups", outGroups);
287                         outputNames.push_back(outputDir + getRootName(getSimpleName(fastaFile)) + groupVector[i] + ".groups");
288                         
289                         while(!inFASTA.eof()){
290                                 if(inFASTA.get() == '>'){
291                                         inFASTA >> seqName;
292                                         outGroups << seqName << '\t' << groupVector[i] << endl;
293                                 }
294                                 while (!inFASTA.eof())  {       char c = inFASTA.get(); if (c == 10 || c == 13){        break;  }       }
295                         }
296                         outGroups.close();
297                         inFASTA.close();
298                 }
299                 
300                 if (m->control_pressed) { 
301                         for (int i = 0; i < outputNames.size(); i++) {  remove(outputNames[i].c_str()); }
302                         return 0;
303                 }
304
305                 m->mothurOutEndLine();
306                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
307                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
308                 m->mothurOutEndLine();
309                 
310                 return 0;       
311                         
312         }
313         catch(exception& e) {
314                 m->errorOut(e, "TrimSeqsCommand", "execute");
315                 exit(1);
316         }
317 }
318                 
319 /**************************************************************************************/
320 int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string trimFile, string scrapFile, string groupFile, vector<string> fastaNames, linePair* line, linePair* qline) {     
321         try {
322                 
323                 ofstream outFASTA;
324                 int able = openOutputFile(trimFile, outFASTA);
325                 
326                 ofstream scrapFASTA;
327                 openOutputFile(scrapFile, scrapFASTA);
328                 
329                 ofstream outGroups;
330                 vector<ofstream*> fastaFileNames;
331                 if (oligoFile != "") {          
332                         openOutputFile(groupFile, outGroups);   
333                         for (int i = 0; i < fastaNames.size(); i++) {
334                         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
335                                 fastaFileNames.push_back(new ofstream((fastaNames[i] + toString(getpid()) + ".temp").c_str(), ios::ate)); 
336                         #else
337                                 fastaFileNames.push_back(new ofstream((fastaNames[i] + toString(i) + ".temp").c_str(), ios::ate));                      
338                         #endif
339                         }
340                 }
341                 
342                 ifstream inFASTA;
343                 openInputFile(filename, inFASTA);
344                 
345                 ifstream qFile;
346                 if(qFileName != "")     {       openInputFile(qFileName, qFile);        }
347                 
348                 qFile.seekg(qline->start);
349                 inFASTA.seekg(line->start);
350                 
351                 for(int i=0;i<line->num;i++){
352                                 
353                         if (m->control_pressed) { 
354                                 inFASTA.close(); 
355                                 outFASTA.close();
356                                 scrapFASTA.close();
357                                 if (oligoFile != "") {   outGroups.close();   }
358                                 if(qFileName != "")     {       qFile.close();  }
359                                 for(int i=0;i<fastaFileNames.size();i++){
360                                         fastaFileNames[i]->close();
361                                         delete fastaFileNames[i];
362                                 }       
363                                 for (int i = 0; i < outputNames.size(); i++) {  remove(outputNames[i].c_str()); }
364                                 return 0;
365                         }
366                         
367                         int success = 1;
368                         
369                         Sequence currSeq(inFASTA);
370
371                         string origSeq = currSeq.getUnaligned();
372                         if (origSeq != "") {
373                                 int group;
374                                 string trashCode = "";
375                                 int currentSeqsDiffs = 0;
376                                 
377                                 if(qFileName != ""){
378                                         if(qThreshold != 0)             {       success = stripQualThreshold(currSeq, qFile);   }
379                                         else if(qAverage != 0)  {       success = cullQualAverage(currSeq, qFile);              }
380                                         if ((!qtrim) && (origSeq.length() != currSeq.getUnaligned().length())) { 
381                                                 success = 0; //if you don't want to trim and the sequence does not meet quality requirements, move to scrap
382                                         }
383                                         if(!success)                    {       trashCode += 'q';                                                               }
384                                 }
385                         
386                                 if(barcodes.size() != 0){
387                                         success = stripBarcode(currSeq, group);
388                                         if(success > bdiffs){   trashCode += 'b';       }
389                                         else{ currentSeqsDiffs += success;  }
390                                 }
391
392                                 if(numFPrimers != 0){
393                                         success = stripForward(currSeq);
394                                         if(success > pdiffs){   trashCode += 'f';       }
395                                         else{ currentSeqsDiffs += success;  }
396                                 }
397                                 
398                                 if (currentSeqsDiffs > tdiffs) { trashCode += 't';   }
399
400                                 if(numRPrimers != 0){
401                                         success = stripReverse(currSeq);
402                                         if(!success){   trashCode += 'r';       }
403                                 }
404                 
405                                 if(minLength > 0 || maxLength > 0){
406                                         success = cullLength(currSeq);
407                                         if(!success){   trashCode += 'l'; }
408                                 }
409                                 if(maxHomoP > 0){
410                                         success = cullHomoP(currSeq);
411                                         if(!success){   trashCode += 'h';       }
412                                 }
413                                 if(maxAmbig != -1){
414                                         success = cullAmbigs(currSeq);
415                                         if(!success){   trashCode += 'n';       }
416                                 }
417                                 
418                                 if(flip){       currSeq.reverseComplement();    }               // should go last                       
419                                 
420                                 if(trashCode.length() == 0){
421                                         currSeq.setAligned(currSeq.getUnaligned());
422                                         currSeq.printSequence(outFASTA);
423                                         if(barcodes.size() != 0){
424                                                 outGroups << currSeq.getName() << '\t' << groupVector[group] << endl;
425                                                 
426                                                 if(allFiles){
427                                                         currSeq.printSequence(*fastaFileNames[group]);                                  
428                                                 }
429                                         }
430                                 }
431                                 else{
432                                         currSeq.setName(currSeq.getName() + '|' + trashCode);
433                                         currSeq.setUnaligned(origSeq);
434                                         currSeq.setAligned(origSeq);
435                                         currSeq.printSequence(scrapFASTA);
436                                 }
437                         }
438                         gobble(inFASTA);
439                 }
440                 
441                 inFASTA.close();
442                 outFASTA.close();
443                 scrapFASTA.close();
444                 if (oligoFile != "") {   outGroups.close();   }
445                 if(qFileName != "")     {       qFile.close();  }
446                 
447                 for(int i=0;i<fastaFileNames.size();i++){
448                         fastaFileNames[i]->close();
449                         delete fastaFileNames[i];
450                 }               
451                 
452                 return 0;
453         }
454         catch(exception& e) {
455                 m->errorOut(e, "TrimSeqsCommand", "driverCreateTrim");
456                 exit(1);
457         }
458 }
459 /**************************************************************************************************/
460 int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName, string trimFile, string scrapFile, string groupFile, vector<string> fastaNames) {
461         try {
462 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
463                 int process = 0;
464                 int exitCommand = 1;
465                 processIDS.clear();
466                 
467                 //loop through and create all the processes you want
468                 while (process != processors) {
469                         int pid = fork();
470                         
471                         if (pid > 0) {
472                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
473                                 process++;
474                         }else if (pid == 0){
475                                 driverCreateTrim(filename, qFileName, (trimFile + toString(getpid()) + ".temp"), (scrapFile + toString(getpid()) + ".temp"), (groupFile + toString(getpid()) + ".temp"), fastaNames, lines[process], qLines[process]);
476                                 exit(0);
477                         }else { m->mothurOut("unable to spawn the necessary processes."); m->mothurOutEndLine(); exit(0); }
478                 }
479                 
480                 //force parent to wait until all the processes are done
481                 for (int i=0;i<processors;i++) { 
482                         int temp = processIDS[i];
483                         wait(&temp);
484                 }
485                 
486                 return exitCommand;
487 #endif          
488         }
489         catch(exception& e) {
490                 m->errorOut(e, "TrimSeqsCommand", "createProcessesCreateTrim");
491                 exit(1);
492         }
493 }
494 /**************************************************************************************************/
495
496 int TrimSeqsCommand::setLines(string filename, vector<linePair*>& lines) {
497         try {
498                 
499                 lines.clear();
500                 
501                 vector<long int> positions;
502                 
503                 ifstream inFASTA;
504                 openInputFile(filename, inFASTA);
505                         
506                 string input;
507                 while(!inFASTA.eof()){  
508                         input = getline(inFASTA);
509
510                         if (input.length() != 0) {
511                                 if(input[0] == '>'){ long int pos = inFASTA.tellg(); positions.push_back(pos - input.length() - 1);     }
512                         }
513                 }
514                 inFASTA.close();
515                 
516                 int numFastaSeqs = positions.size();
517         
518                 FILE * pFile;
519                 long size;
520                 
521                 //get num bytes in file
522                 pFile = fopen (filename.c_str(),"rb");
523                 if (pFile==NULL) perror ("Error opening file");
524                 else{
525                         fseek (pFile, 0, SEEK_END);
526                         size=ftell (pFile);
527                         fclose (pFile);
528                 }
529                 
530                 int numSeqsPerProcessor = numFastaSeqs / processors;
531                 
532                 for (int i = 0; i < processors; i++) {
533
534                         long int startPos = positions[ i * numSeqsPerProcessor ];
535                         if(i == processors - 1){
536                                 numSeqsPerProcessor = numFastaSeqs - i * numSeqsPerProcessor;
537                         }else{  
538                                 long int myEnd = positions[ (i+1) * numSeqsPerProcessor ];
539                         }
540                         lines.push_back(new linePair(startPos, numSeqsPerProcessor));
541                 }
542                 
543                 return numFastaSeqs;
544         }
545         catch(exception& e) {
546                 m->errorOut(e, "TrimSeqsCommand", "setLines");
547                 exit(1);
548         }
549 }
550 //***************************************************************************************************************
551
552 void TrimSeqsCommand::getOligos(vector<string>& outFASTAVec){ //vector<ofstream*>& outFASTAVec
553         try {
554                 ifstream inOligos;
555                 openInputFile(oligoFile, inOligos);
556                 
557                 ofstream test;
558                 
559                 string type, oligo, group;
560                 int index=0;
561                 
562                 while(!inOligos.eof()){
563                         inOligos >> type;
564                         
565                         if(type[0] == '#'){
566                                 while (!inOligos.eof()) {       char c = inOligos.get(); if (c == 10 || c == 13){       break;  }       } // get rest of line if there's any crap there
567                         }
568                         else{
569                                 inOligos >> oligo;
570                                 
571                                 for(int i=0;i<oligo.length();i++){
572                                         oligo[i] = toupper(oligo[i]);
573                                         if(oligo[i] == 'U')     {       oligo[i] = 'T'; }
574                                 }
575                                 
576                                 if(type == "forward"){
577                                         forPrimer.push_back(oligo);
578                                 }
579                                 else if(type == "reverse"){
580                                         Sequence oligoRC("reverse", oligo);
581                                         oligoRC.reverseComplement();
582                                         revPrimer.push_back(oligoRC.getUnaligned());
583                                 }
584                                 else if(type == "barcode"){
585                                         inOligos >> group;
586                                         barcodes[oligo]=index++;
587                                         groupVector.push_back(group);
588                                         
589                                         if(allFiles){
590                                                 //outFASTAVec.push_back(new ofstream((outputDir + getRootName(getSimpleName(fastaFile)) + group + ".fasta").c_str(), ios::ate));
591                                                 outputNames.push_back((outputDir + getRootName(getSimpleName(fastaFile)) + group + ".fasta"));
592                                                 outFASTAVec.push_back((outputDir + getRootName(getSimpleName(fastaFile)) + group + ".fasta"));
593                                         }
594                                 }
595                         }
596                 }
597                 
598                 inOligos.close();
599                 
600                 numFPrimers = forPrimer.size();
601                 numRPrimers = revPrimer.size();
602                 
603         }
604         catch(exception& e) {
605                 m->errorOut(e, "TrimSeqsCommand", "getOligos");
606                 exit(1);
607         }
608 }
609 //***************************************************************************************************************
610
611 int TrimSeqsCommand::stripBarcode(Sequence& seq, int& group){
612         try {
613                 
614                 string rawSequence = seq.getUnaligned();
615                 int success = bdiffs + 1;       //guilty until proven innocent
616                 
617                 //can you find the barcode
618                 for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
619                         string oligo = it->first;
620                         if(rawSequence.length() < oligo.length()){      //let's just assume that the barcodes are the same length
621                                 success = bdiffs + 10;                                  //if the sequence is shorter than the barcode then bail out
622                                 break;  
623                         }
624                         
625                         if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
626                                 group = it->second;
627                                 seq.setUnaligned(rawSequence.substr(oligo.length()));
628                                 success = 0;
629                                 break;
630                         }
631                 }
632                 
633                 //if you found the barcode or if you don't want to allow for diffs
634 //              cout << success;
635                 if ((bdiffs == 0) || (success == 0)) { return success;  }
636                 
637                 else { //try aligning and see if you can find it
638 //                      cout << endl;
639
640                         int maxLength = 0;
641
642                         Alignment* alignment;
643                         if (barcodes.size() > 0) {
644                                 map<string,int>::iterator it=barcodes.begin();
645
646                                 for(it;it!=barcodes.end();it++){
647                                         if(it->first.length() > maxLength){
648                                                 maxLength = it->first.length();
649                                         }
650                                 }
651                                 alignment = new NeedlemanOverlap(-2.0, 1.0, -1.0, (maxLength+bdiffs+1));  
652
653                         }else{ alignment = NULL; } 
654                         
655                         //can you find the barcode
656                         int minDiff = 1e6;
657                         int minCount = 1;
658                         int minGroup = -1;
659                         int minPos = 0;
660                         
661                         for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
662                                 string oligo = it->first;
663 //                              int length = oligo.length();
664                                 
665                                 if(rawSequence.length() < maxLength){   //let's just assume that the barcodes are the same length
666                                         success = bdiffs + 10;
667                                         break;
668                                 }
669                                 
670                                 //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
671                                 alignment->align(oligo, rawSequence.substr(0,oligo.length()+bdiffs));
672                                 oligo = alignment->getSeqAAln();
673                                 string temp = alignment->getSeqBAln();
674                 
675                                 int alnLength = oligo.length();
676                                 
677                                 for(int i=oligo.length()-1;i>=0;i--){
678                                         if(oligo[i] != '-'){    alnLength = i+1;        break;  }
679                                 }
680                                 oligo = oligo.substr(0,alnLength);
681                                 temp = temp.substr(0,alnLength);
682                                 
683                                 int newStart=0;
684                                 int numDiff = countDiffs(oligo, temp);
685                                 if(numDiff < minDiff){
686                                         minDiff = numDiff;
687                                         minCount = 1;
688                                         minGroup = it->second;
689                                         minPos = 0;
690                                         for(int i=0;i<alnLength;i++){
691                                                 if(temp[i] != '-'){
692                                                         minPos++;
693                                                 }
694                                         }
695                                 }
696                                 else if(numDiff == minDiff){
697                                         minCount++;
698                                 }
699
700                         }
701
702                         if(minDiff > bdiffs)    {       success = minDiff;              }       //no good matches
703                         else if(minCount > 1)   {       success = bdiffs + 100; }       //can't tell the difference between multiple barcodes
704                         else{                                                                                                   //use the best match
705                                 group = minGroup;
706                                 seq.setUnaligned(rawSequence.substr(minPos));
707                                 success = minDiff;
708                         }
709                         
710                         if (alignment != NULL) {  delete alignment;  }
711                         
712                 }
713 //              cout << success << endl;
714                 
715                 return success;
716                 
717         }
718         catch(exception& e) {
719                 m->errorOut(e, "TrimSeqsCommand", "stripBarcode");
720                 exit(1);
721         }
722
723 }
724
725 //***************************************************************************************************************
726
727 int TrimSeqsCommand::stripForward(Sequence& seq){
728         try {
729                 string rawSequence = seq.getUnaligned();
730                 int success = pdiffs + 1;       //guilty until proven innocent
731                 
732                 //can you find the primer
733                 for(int i=0;i<numFPrimers;i++){
734                         string oligo = forPrimer[i];
735
736                         if(rawSequence.length() < oligo.length()){      
737                                 success = pdiffs + 1;
738                                 break;
739                         }
740                         
741                         if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
742                                 seq.setUnaligned(rawSequence.substr(oligo.length()));
743                                 success = 0;
744                                 break;
745                         }
746                 }
747                 
748                 //if you found the barcode or if you don't want to allow for diffs
749 //              cout << success;
750                 if ((pdiffs == 0) || (success == 0)) { return success;  }
751                 
752                 else { //try aligning and see if you can find it
753 //                      cout << endl;
754
755                         int maxLength = 0;
756
757                         Alignment* alignment;
758                         if (numFPrimers > 0) {
759
760                                 for(int i=0;i<numFPrimers;i++){
761                                         if(forPrimer[i].length() > maxLength){
762                                                 maxLength = forPrimer[i].length();
763                                         }
764                                 }
765                                 alignment = new NeedlemanOverlap(-2.0, 1.0, -1.0, (maxLength+pdiffs+1));  
766
767                         }else{ alignment = NULL; } 
768                         
769                         //can you find the barcode
770                         int minDiff = 1e6;
771                         int minCount = 1;
772                         int minPos = 0;
773                         
774                         for(int i=0;i<numFPrimers;i++){
775                                 string oligo = forPrimer[i];
776                                 
777                                 if(rawSequence.length() < maxLength){   
778                                         success = pdiffs + 100;
779                                         break;
780                                 }
781                                 
782                                 //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
783                                 alignment->align(oligo, rawSequence.substr(0,oligo.length()+pdiffs));
784                                 oligo = alignment->getSeqAAln();
785                                 string temp = alignment->getSeqBAln();
786                 
787                                 int alnLength = oligo.length();
788                                 
789                                 for(int i=oligo.length()-1;i>=0;i--){
790                                         if(oligo[i] != '-'){    alnLength = i+1;        break;  }
791                                 }
792                                 oligo = oligo.substr(0,alnLength);
793                                 temp = temp.substr(0,alnLength);
794
795                                 int newStart=0;
796                                 int numDiff = countDiffs(oligo, temp);
797                                 if(numDiff < minDiff){
798                                         minDiff = numDiff;
799                                         minCount = 1;
800                                         minPos = 0;
801                                         for(int i=0;i<alnLength;i++){
802                                                 if(temp[i] != '-'){
803                                                         minPos++;
804                                                 }
805                                         }
806                                 }
807                                 else if(numDiff == minDiff){
808                                         minCount++;
809                                 }
810
811                         }
812                         if(minDiff > pdiffs)    {       success =  minDiff;             }
813                         else if(minCount > 1)   {       success =  pdiffs + 10; }
814                         else{
815                                 seq.setUnaligned(rawSequence.substr(minPos));
816                                 success = minDiff;
817                         }
818                         
819                         if (alignment != NULL) {  delete alignment;  }
820                         
821                 }
822                 return success;
823
824         }
825         catch(exception& e) {
826                 m->errorOut(e, "TrimSeqsCommand", "stripForward");
827                 exit(1);
828         }
829 }
830
831 //***************************************************************************************************************
832
833 bool TrimSeqsCommand::stripReverse(Sequence& seq){
834         try {
835                 string rawSequence = seq.getUnaligned();
836                 bool success = 0;       //guilty until proven innocent
837                 
838                 for(int i=0;i<numRPrimers;i++){
839                         string oligo = revPrimer[i];
840                         
841                         if(rawSequence.length() < oligo.length()){
842                                 success = 0;
843                                 break;
844                         }
845                         
846                         if(compareDNASeq(oligo, rawSequence.substr(rawSequence.length()-oligo.length(),oligo.length()))){
847                                 seq.setUnaligned(rawSequence.substr(0,rawSequence.length()-oligo.length()));
848                                 success = 1;
849                                 break;
850                         }
851                 }       
852                 return success;
853                 
854         }
855         catch(exception& e) {
856                 m->errorOut(e, "TrimSeqsCommand", "stripReverse");
857                 exit(1);
858         }
859 }
860
861 //***************************************************************************************************************
862
863 bool TrimSeqsCommand::cullLength(Sequence& seq){
864         try {
865         
866                 int length = seq.getNumBases();
867                 bool success = 0;       //guilty until proven innocent
868                 
869                 if(length >= minLength && maxLength == 0)                       {       success = 1;    }
870                 else if(length >= minLength && length <= maxLength)     {       success = 1;    }
871                 else                                                                                            {       success = 0;    }
872                 
873                 return success;
874         
875         }
876         catch(exception& e) {
877                 m->errorOut(e, "TrimSeqsCommand", "cullLength");
878                 exit(1);
879         }
880         
881 }
882
883 //***************************************************************************************************************
884
885 bool TrimSeqsCommand::cullHomoP(Sequence& seq){
886         try {
887                 int longHomoP = seq.getLongHomoPolymer();
888                 bool success = 0;       //guilty until proven innocent
889                 
890                 if(longHomoP <= maxHomoP){      success = 1;    }
891                 else                                    {       success = 0;    }
892                 
893                 return success;
894         }
895         catch(exception& e) {
896                 m->errorOut(e, "TrimSeqsCommand", "cullHomoP");
897                 exit(1);
898         }
899         
900 }
901
902 //***************************************************************************************************************
903
904 bool TrimSeqsCommand::cullAmbigs(Sequence& seq){
905         try {
906                 int numNs = seq.getAmbigBases();
907                 bool success = 0;       //guilty until proven innocent
908                 
909                 if(numNs <= maxAmbig)   {       success = 1;    }
910                 else                                    {       success = 0;    }
911                 
912                 return success;
913         }
914         catch(exception& e) {
915                 m->errorOut(e, "TrimSeqsCommand", "cullAmbigs");
916                 exit(1);
917         }
918         
919 }
920
921 //***************************************************************************************************************
922
923 bool TrimSeqsCommand::compareDNASeq(string oligo, string seq){
924         try {
925                 bool success = 1;
926                 int length = oligo.length();
927                 
928                 for(int i=0;i<length;i++){
929                         
930                         if(oligo[i] != seq[i]){
931                                 if(oligo[i] == 'A' || oligo[i] == 'T' || oligo[i] == 'G' || oligo[i] == 'C')    {       success = 0;    }
932                                 else if((oligo[i] == 'N' || oligo[i] == 'I') && (seq[i] == 'N'))                                {       success = 0;    }
933                                 else if(oligo[i] == 'R' && (seq[i] != 'A' && seq[i] != 'G'))                                    {       success = 0;    }
934                                 else if(oligo[i] == 'Y' && (seq[i] != 'C' && seq[i] != 'T'))                                    {       success = 0;    }
935                                 else if(oligo[i] == 'M' && (seq[i] != 'C' && seq[i] != 'A'))                                    {       success = 0;    }
936                                 else if(oligo[i] == 'K' && (seq[i] != 'T' && seq[i] != 'G'))                                    {       success = 0;    }
937                                 else if(oligo[i] == 'W' && (seq[i] != 'T' && seq[i] != 'A'))                                    {       success = 0;    }
938                                 else if(oligo[i] == 'S' && (seq[i] != 'C' && seq[i] != 'G'))                                    {       success = 0;    }
939                                 else if(oligo[i] == 'B' && (seq[i] != 'C' && seq[i] != 'T' && seq[i] != 'G'))   {       success = 0;    }
940                                 else if(oligo[i] == 'D' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'G'))   {       success = 0;    }
941                                 else if(oligo[i] == 'H' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'C'))   {       success = 0;    }
942                                 else if(oligo[i] == 'V' && (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G'))   {       success = 0;    }                       
943                                 
944                                 if(success == 0)        {       break;   }
945                         }
946                         else{
947                                 success = 1;
948                         }
949                 }
950                 
951                 return success;
952         }
953         catch(exception& e) {
954                 m->errorOut(e, "TrimSeqsCommand", "compareDNASeq");
955                 exit(1);
956         }
957
958 }
959 //***************************************************************************************************************
960
961 int TrimSeqsCommand::countDiffs(string oligo, string seq){
962         try {
963
964                 int length = oligo.length();
965                 int countDiffs = 0;
966                 
967                 for(int i=0;i<length;i++){
968                                                                 
969                         if(oligo[i] != seq[i]){
970                                 if(oligo[i] == 'A' || oligo[i] == 'T' || oligo[i] == 'G' || oligo[i] == 'C' || oligo[i] == '-' || oligo[i] == '.')      {       countDiffs++;   }
971                                 else if((oligo[i] == 'N' || oligo[i] == 'I') && (seq[i] == 'N'))                                {       countDiffs++;   }
972                                 else if(oligo[i] == 'R' && (seq[i] != 'A' && seq[i] != 'G'))                                    {       countDiffs++;   }
973                                 else if(oligo[i] == 'Y' && (seq[i] != 'C' && seq[i] != 'T'))                                    {       countDiffs++;   }
974                                 else if(oligo[i] == 'M' && (seq[i] != 'C' && seq[i] != 'A'))                                    {       countDiffs++;   }
975                                 else if(oligo[i] == 'K' && (seq[i] != 'T' && seq[i] != 'G'))                                    {       countDiffs++;   }
976                                 else if(oligo[i] == 'W' && (seq[i] != 'T' && seq[i] != 'A'))                                    {       countDiffs++;   }
977                                 else if(oligo[i] == 'S' && (seq[i] != 'C' && seq[i] != 'G'))                                    {       countDiffs++;   }
978                                 else if(oligo[i] == 'B' && (seq[i] != 'C' && seq[i] != 'T' && seq[i] != 'G'))   {       countDiffs++;   }
979                                 else if(oligo[i] == 'D' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'G'))   {       countDiffs++;   }
980                                 else if(oligo[i] == 'H' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'C'))   {       countDiffs++;   }
981                                 else if(oligo[i] == 'V' && (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G'))   {       countDiffs++;   }                       
982                                 
983                         }
984                 }
985                 
986                 return countDiffs;
987         }
988         catch(exception& e) {
989                 m->errorOut(e, "TrimSeqsCommand", "countDiffs");
990                 exit(1);
991         }
992
993 }
994 //***************************************************************************************************************
995
996 bool TrimSeqsCommand::stripQualThreshold(Sequence& seq, ifstream& qFile){
997         try {
998                 string rawSequence = seq.getUnaligned();
999                 int seqLength;  // = rawSequence.length();
1000                 string name, temp, temp2;
1001                 
1002                 qFile >> name;
1003                 
1004                 //get rest of line
1005                 temp = "";
1006                 while (!qFile.eof())    {       
1007                         char c = qFile.get(); 
1008                         if (c == 10 || c == 13){        break;  }       
1009                         else { temp += c; }
1010                 } 
1011         
1012                 int pos = temp.find("length");
1013                 if (pos == temp.npos) { m->mothurOut("Cannot find length in qfile for " + seq.getName()); m->mothurOutEndLine();  seqLength = 0;  }
1014                 else {
1015                         string tempLength = temp.substr(pos);
1016                         istringstream iss (tempLength,istringstream::in);
1017                         iss >> temp;
1018                 }
1019                 
1020                 splitAtEquals(temp2, temp); //separates length=242, temp=length, temp2=242
1021                 convert(temp, seqLength); //converts string to int
1022         
1023                 if (name.length() != 0) {  if(name.substr(1) != seq.getName())  {       m->mothurOut("sequence name mismatch btwn fasta and qual file"); m->mothurOutEndLine(); }  } 
1024                 
1025                 int score;
1026                 int end = seqLength;
1027                 
1028                 for(int i=0;i<seqLength;i++){
1029                         qFile >> score;
1030                         
1031                         if(score <= qThreshold){
1032                                 end = i;
1033                                 break;
1034                         }
1035                 }
1036                 for(int i=end+1;i<seqLength;i++){
1037                         qFile >> score;
1038                 }
1039                 
1040                 seq.setUnaligned(rawSequence.substr(0,end));
1041                 
1042                 return 1;
1043         }
1044         catch(exception& e) {
1045                 m->errorOut(e, "TrimSeqsCommand", "stripQualThreshold");
1046                 exit(1);
1047         }
1048 }
1049
1050 //***************************************************************************************************************
1051
1052 bool TrimSeqsCommand::cullQualAverage(Sequence& seq, ifstream& qFile){
1053         try {
1054                 string rawSequence = seq.getUnaligned();
1055                 int seqLength = seq.getNumBases();
1056                 bool success = 0;       //guilty until proven innocent
1057                 string name;
1058                 
1059                 qFile >> name;
1060                 if (name[0] == '>') {  if(name.substr(1) != seq.getName())      {       m->mothurOut("sequence name mismatch btwn fasta: " + seq.getName() + " and qual file: " + name); m->mothurOutEndLine(); }  }
1061                 
1062                 while (!qFile.eof())    {       char c = qFile.get(); if (c == 10 || c == 13){  break;  }       }
1063                 
1064                 float score;    
1065                 float average = 0;
1066                 
1067                 for(int i=0;i<seqLength;i++){
1068                         qFile >> score;
1069                         average += score;
1070                 }
1071                 average /= seqLength;
1072
1073                 if(average >= qAverage) {       success = 1;    }
1074                 else                                    {       success = 0;    }
1075                 
1076                 return success;
1077         }
1078         catch(exception& e) {
1079                 m->errorOut(e, "TrimSeqsCommand", "cullQualAverage");
1080                 exit(1);
1081         }
1082 }
1083
1084 //***************************************************************************************************************