]> git.donarmstrong.com Git - mothur.git/blob - trimseqscommand.cpp
made a small mod to the alignment parameters in trim.seqs
[mothur.git] / trimseqscommand.cpp
1 /*
2  *  trimseqscommand.cpp
3  *  Mothur
4  *
5  *  Created by Pat Schloss on 6/6/09.
6  *  Copyright 2009 Patrick D. Schloss. All rights reserved.
7  *
8  */
9
10 #include "trimseqscommand.h"
11 #include "needlemanoverlap.hpp"
12
13 //***************************************************************************************************************
14
15 TrimSeqsCommand::TrimSeqsCommand(string option)  {
16         try {
17                 
18                 abort = false;
19                 
20                 //allow user to run help
21                 if(option == "help") { help(); abort = true; }
22                 
23                 else {
24                         //valid paramters for this command
25                         string AlignArray[] =  {"fasta", "flip", "oligos", "maxambig", "maxhomop", "minlength", "maxlength", "qfile", 
26                                                                         "qthreshold", "qaverage", "allfiles", "qtrim","tdiffs", "pdiffs", "bdiffs", "processors", "outputdir","inputdir"};
27                         
28                         vector<string> myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string)));
29                         
30                         OptionParser parser(option);
31                         map<string,string> parameters = parser.getParameters();
32                         
33                         ValidParameters validParameter;
34                         map<string,string>::iterator it;
35                         
36                         //check to make sure all parameters are valid for command
37                         for (it = parameters.begin(); it != parameters.end(); it++) { 
38                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
39                         }
40                         
41                         //if the user changes the input directory command factory will send this info to us in the output parameter 
42                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
43                         if (inputDir == "not found"){   inputDir = "";          }
44                         else {
45                                 string path;
46                                 it = parameters.find("fasta");
47                                 //user has given a template file
48                                 if(it != parameters.end()){ 
49                                         path = hasPath(it->second);
50                                         //if the user has not given a path then, add inputdir. else leave path alone.
51                                         if (path == "") {       parameters["fasta"] = inputDir + it->second;            }
52                                 }
53                                 
54                                 it = parameters.find("oligos");
55                                 //user has given a template file
56                                 if(it != parameters.end()){ 
57                                         path = hasPath(it->second);
58                                         //if the user has not given a path then, add inputdir. else leave path alone.
59                                         if (path == "") {       parameters["oligos"] = inputDir + it->second;           }
60                                 }
61                                 
62                                 it = parameters.find("qfile");
63                                 //user has given a template file
64                                 if(it != parameters.end()){ 
65                                         path = hasPath(it->second);
66                                         //if the user has not given a path then, add inputdir. else leave path alone.
67                                         if (path == "") {       parameters["qfile"] = inputDir + it->second;            }
68                                 }
69                         }
70
71                         
72                         //check for required parameters
73                         fastaFile = validParameter.validFile(parameters, "fasta", true);
74                         if (fastaFile == "not found") { m->mothurOut("fasta is a required parameter for the screen.seqs command."); m->mothurOutEndLine(); abort = true; }
75                         else if (fastaFile == "not open") { abort = true; }     
76                         
77                         //if the user changes the output directory command factory will send this info to us in the output parameter 
78                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  
79                                 outputDir = ""; 
80                                 outputDir += hasPath(fastaFile); //if user entered a file with a path then preserve it  
81                         }
82                 
83                         //check for optional parameter and set defaults
84                         // ...at some point should added some additional type checking...
85                         string temp;
86                         temp = validParameter.validFile(parameters, "flip", false);
87                         if (temp == "not found"){       flip = 0;       }
88                         else if(isTrue(temp))   {       flip = 1;       }
89                 
90                         temp = validParameter.validFile(parameters, "oligos", true);
91                         if (temp == "not found"){       oligoFile = "";         }
92                         else if(temp == "not open"){    abort = true;   } 
93                         else                                    {       oligoFile = temp;               }
94                         
95                         temp = validParameter.validFile(parameters, "maxambig", false);         if (temp == "not found") { temp = "-1"; }
96                         convert(temp, maxAmbig);  
97
98                         temp = validParameter.validFile(parameters, "maxhomop", false);         if (temp == "not found") { temp = "0"; }
99                         convert(temp, maxHomoP);  
100
101                         temp = validParameter.validFile(parameters, "minlength", false);        if (temp == "not found") { temp = "0"; }
102                         convert(temp, minLength); 
103                         
104                         temp = validParameter.validFile(parameters, "maxlength", false);        if (temp == "not found") { temp = "0"; }
105                         convert(temp, maxLength);
106                         
107                         
108                         temp = validParameter.validFile(parameters, "bdiffs", false);           if (temp == "not found") { temp = "0"; }
109                         convert(temp, bdiffs);
110                         
111                         temp = validParameter.validFile(parameters, "pdiffs", false);           if (temp == "not found") { temp = "0"; }
112                         convert(temp, pdiffs);
113                         
114                         temp = validParameter.validFile(parameters, "tdiffs", false);           if (temp == "not found") { int tempTotal = pdiffs + bdiffs;  temp = toString(tempTotal); }
115                         convert(temp, tdiffs);
116                         
117                         if(tdiffs == 0){        tdiffs = bdiffs + pdiffs;       }
118                         
119                         temp = validParameter.validFile(parameters, "qfile", true);     
120                         if (temp == "not found")        {       qFileName = "";         }
121                         else if(temp == "not open")     {       abort = true;           }
122                         else                                            {       qFileName = temp;       }
123                         
124                         temp = validParameter.validFile(parameters, "qthreshold", false);       if (temp == "not found") { temp = "0"; }
125                         convert(temp, qThreshold);
126                         
127                         temp = validParameter.validFile(parameters, "qtrim", false);    if (temp == "not found") { temp = "F"; }
128                         qtrim = isTrue(temp);
129
130                         temp = validParameter.validFile(parameters, "qaverage", false);         if (temp == "not found") { temp = "0"; }
131                         convert(temp, qAverage);
132                         
133                         temp = validParameter.validFile(parameters, "allfiles", false);         if (temp == "not found") { temp = "F"; }
134                         allFiles = isTrue(temp);
135                         
136                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found"){       temp = "1";                             }
137                         convert(temp, processors); 
138                         
139                         if(allFiles && oligoFile == ""){
140                                 m->mothurOut("You selected allfiles, but didn't enter an oligos file.  Ignoring the allfiles request."); m->mothurOutEndLine();
141                         }
142                         if((qAverage != 0 && qThreshold != 0) && qFileName == ""){
143                                 m->mothurOut("You didn't provide a quality file name, quality criteria will be ignored."); m->mothurOutEndLine();
144                                 qAverage=0;
145                                 qThreshold=0;
146                         }
147                         if(!flip && oligoFile=="" && !maxLength && !minLength && (maxAmbig==-1) && !maxHomoP && qFileName == ""){               
148                                 m->mothurOut("You didn't set any options... quiting command."); m->mothurOutEndLine();
149                                 abort = true;
150                         }
151                 }
152
153         }
154         catch(exception& e) {
155                 m->errorOut(e, "TrimSeqsCommand", "TrimSeqsCommand");
156                 exit(1);
157         }
158 }
159 //**********************************************************************************************************************
160
161 void TrimSeqsCommand::help(){
162         try {
163                 m->mothurOut("The trim.seqs command reads a fastaFile and creates .....\n");
164                 m->mothurOut("The trim.seqs command parameters are fasta, flip, oligos, maxambig, maxhomop, minlength, maxlength, qfile, qthreshold, qaverage, diffs, qtrim and allfiles.\n");
165                 m->mothurOut("The fasta parameter is required.\n");
166                 m->mothurOut("The flip parameter will output the reverse compliment of your trimmed sequence. The default is false.\n");
167                 m->mothurOut("The oligos parameter .... The default is "".\n");
168                 m->mothurOut("The maxambig parameter .... The default is -1.\n");
169                 m->mothurOut("The maxhomop parameter .... The default is 0.\n");
170                 m->mothurOut("The minlength parameter .... The default is 0.\n");
171                 m->mothurOut("The maxlength parameter .... The default is 0.\n");
172                 m->mothurOut("The tdiffs parameter is used to specify the total number of differences allowed in the sequence. The default is pdiffs + bdiffs.\n");
173                 m->mothurOut("The bdiffs parameter is used to specify the number of differences allowed in the barcode. The default is 0.\n");
174                 m->mothurOut("The pdiffs parameter is used to specify the number of differences allowed in the primer. The default is 0.\n");
175                 m->mothurOut("The qfile parameter .....\n");
176                 m->mothurOut("The qthreshold parameter .... The default is 0.\n");
177                 m->mothurOut("The qaverage parameter .... The default is 0.\n");
178                 m->mothurOut("The allfiles parameter .... The default is F.\n");
179                 m->mothurOut("The qtrim parameter .... The default is F.\n");
180                 m->mothurOut("The trim.seqs command should be in the following format: \n");
181                 m->mothurOut("trim.seqs(fasta=yourFastaFile, flip=yourFlip, oligos=yourOligos, maxambig=yourMaxambig,  \n");
182                 m->mothurOut("maxhomop=yourMaxhomop, minlength=youMinlength, maxlength=yourMaxlength)  \n");    
183                 m->mothurOut("Example trim.seqs(fasta=abrecovery.fasta, flip=..., oligos=..., maxambig=..., maxhomop=..., minlength=..., maxlength=...).\n");
184                 m->mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n");
185                 m->mothurOut("For more details please check out the wiki http://www.mothur.org/wiki/Trim.seqs .\n\n");
186
187         }
188         catch(exception& e) {
189                 m->errorOut(e, "TrimSeqsCommand", "help");
190                 exit(1);
191         }
192 }
193
194
195 //***************************************************************************************************************
196
197 TrimSeqsCommand::~TrimSeqsCommand(){    /*      do nothing      */      }
198
199 //***************************************************************************************************************
200
201 int TrimSeqsCommand::execute(){
202         try{
203         
204                 if (abort == true) { return 0; }
205                 
206                 numFPrimers = 0;  //this needs to be initialized
207                 numRPrimers = 0;
208                 
209                 string trimSeqFile = outputDir + getRootName(getSimpleName(fastaFile)) + "trim.fasta";
210                 outputNames.push_back(trimSeqFile);
211                 string scrapSeqFile = outputDir + getRootName(getSimpleName(fastaFile)) + "scrap.fasta";
212                 outputNames.push_back(scrapSeqFile);
213                 string groupFile = outputDir + getRootName(getSimpleName(fastaFile)) + "groups";
214                 
215                 vector<string> fastaFileNames;
216                 if(oligoFile != ""){
217                         outputNames.push_back(groupFile);
218                         getOligos(fastaFileNames);
219                 }
220                 
221                 if(qFileName != "")     {       setLines(qFileName, qLines);    }
222
223                 
224                 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
225                                 if(processors == 1){
226                                         ifstream inFASTA;
227                                         int numSeqs;
228                                         openInputFile(fastaFile, inFASTA);
229                                         getNumSeqs(inFASTA, numSeqs);
230                                         inFASTA.close();
231                                         
232                                         lines.push_back(new linePair(0, numSeqs));
233                                         
234                                         driverCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, groupFile, fastaFileNames, lines[0], lines[0]);
235                                         
236                                         for (int j = 0; j < fastaFileNames.size(); j++) {
237                                                 rename((fastaFileNames[j] + toString(getpid()) + ".temp").c_str(), fastaFileNames[j].c_str());
238                                         }
239
240                                 }else{
241                                         setLines(fastaFile, lines);     
242                                         if(qFileName == "")     {       qLines = lines; }       
243                                                                 
244                                         createProcessesCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, groupFile, fastaFileNames); 
245                                         
246                                         rename((trimSeqFile + toString(processIDS[0]) + ".temp").c_str(), trimSeqFile.c_str());
247                                         rename((scrapSeqFile + toString(processIDS[0]) + ".temp").c_str(), scrapSeqFile.c_str());
248                                         rename((groupFile + toString(processIDS[0]) + ".temp").c_str(), groupFile.c_str());
249                                         for (int j = 0; j < fastaFileNames.size(); j++) {
250                                                 rename((fastaFileNames[j] + toString(processIDS[0]) + ".temp").c_str(), fastaFileNames[j].c_str());
251                                         }
252                                         //append files
253                                         for(int i=1;i<processors;i++){
254                                                 appendFiles((trimSeqFile + toString(processIDS[i]) + ".temp"), trimSeqFile);
255                                                 remove((trimSeqFile + toString(processIDS[i]) + ".temp").c_str());
256                                                 appendFiles((scrapSeqFile + toString(processIDS[i]) + ".temp"), scrapSeqFile);
257                                                 remove((scrapSeqFile + toString(processIDS[i]) + ".temp").c_str());
258                                                 appendFiles((groupFile + toString(processIDS[i]) + ".temp"), groupFile);
259                                                 remove((groupFile + toString(processIDS[i]) + ".temp").c_str());
260                                                 for (int j = 0; j < fastaFileNames.size(); j++) {
261                                                         appendFiles((fastaFileNames[j] + toString(processIDS[i]) + ".temp"), fastaFileNames[j]);
262                                                         remove((fastaFileNames[j] + toString(processIDS[i]) + ".temp").c_str());
263                                                 }
264                                         }
265                                 }
266                                 
267                                 if (m->control_pressed) {  return 0; }
268                 #else
269                                 ifstream inFASTA;
270                                 int numSeqs;
271                                 openInputFile(fastaFile, inFASTA);
272                                 getNumSeqs(inFASTA, numSeqs);
273                                 inFASTA.close();
274                                 
275                                 lines.push_back(new linePair(0, numSeqs));
276                                 
277                                 driverCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, groupFile, fastaFileNames, lines[0], lines[0]);
278                                 
279                                 if (m->control_pressed) {  return 0; }
280                 #endif
281                                                 
282                                                                                 
283                 for(int i=0;i<fastaFileNames.size();i++){
284                         ifstream inFASTA;
285                         string seqName;
286                         openInputFile(getRootName(fastaFile) + groupVector[i] + ".fasta", inFASTA);
287                         ofstream outGroups;
288                         openOutputFile(outputDir + getRootName(getSimpleName(fastaFile)) + groupVector[i] + ".groups", outGroups);
289                         outputNames.push_back(outputDir + getRootName(getSimpleName(fastaFile)) + groupVector[i] + ".groups");
290                         
291                         while(!inFASTA.eof()){
292                                 if(inFASTA.get() == '>'){
293                                         inFASTA >> seqName;
294                                         outGroups << seqName << '\t' << groupVector[i] << endl;
295                                 }
296                                 while (!inFASTA.eof())  {       char c = inFASTA.get(); if (c == 10 || c == 13){        break;  }       }
297                         }
298                         outGroups.close();
299                         inFASTA.close();
300                 }
301                 
302                 if (m->control_pressed) { 
303                         for (int i = 0; i < outputNames.size(); i++) {  remove(outputNames[i].c_str()); }
304                         return 0;
305                 }
306
307                 m->mothurOutEndLine();
308                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
309                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
310                 m->mothurOutEndLine();
311                 
312                 return 0;       
313                         
314         }
315         catch(exception& e) {
316                 m->errorOut(e, "TrimSeqsCommand", "execute");
317                 exit(1);
318         }
319 }
320                 
321 /**************************************************************************************/
322 int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string trimFile, string scrapFile, string groupFile, vector<string> fastaNames, linePair* line, linePair* qline) {     
323         try {
324                 
325                 ofstream outFASTA;
326                 int able = openOutputFile(trimFile, outFASTA);
327                 
328                 ofstream scrapFASTA;
329                 openOutputFile(scrapFile, scrapFASTA);
330                 
331                 ofstream outGroups;
332                 vector<ofstream*> fastaFileNames;
333                 if (oligoFile != "") {          
334                         openOutputFile(groupFile, outGroups);   
335                         for (int i = 0; i < fastaNames.size(); i++) {
336                         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
337                                 fastaFileNames.push_back(new ofstream((fastaNames[i] + toString(getpid()) + ".temp").c_str(), ios::ate)); 
338                         #else
339                                 fastaFileNames.push_back(new ofstream((fastaNames[i] + toString(i) + ".temp").c_str(), ios::ate));                      
340                         #endif
341                         }
342                 }
343                 
344                 ifstream inFASTA;
345                 openInputFile(filename, inFASTA);
346                 
347                 ifstream qFile;
348                 if(qFileName != "")     {       openInputFile(qFileName, qFile);        }
349                 
350                 qFile.seekg(qline->start);
351                 inFASTA.seekg(line->start);
352                 
353                 for(int i=0;i<line->num;i++){
354                                 
355                         if (m->control_pressed) { 
356                                 inFASTA.close(); 
357                                 outFASTA.close();
358                                 scrapFASTA.close();
359                                 if (oligoFile != "") {   outGroups.close();   }
360                                 if(qFileName != "")     {       qFile.close();  }
361                                 for(int i=0;i<fastaFileNames.size();i++){
362                                         fastaFileNames[i]->close();
363                                         delete fastaFileNames[i];
364                                 }       
365                                 for (int i = 0; i < outputNames.size(); i++) {  remove(outputNames[i].c_str()); }
366                                 return 0;
367                         }
368                         
369                         int success = 1;
370                         
371                         Sequence currSeq(inFASTA);
372
373                         string origSeq = currSeq.getUnaligned();
374                         if (origSeq != "") {
375                                 int group;
376                                 string trashCode = "";
377                                 int currentSeqsDiffs = 0;
378                                 
379                                 if(qFileName != ""){
380                                         if(qThreshold != 0)             {       success = stripQualThreshold(currSeq, qFile);   }
381                                         else if(qAverage != 0)  {       success = cullQualAverage(currSeq, qFile);              }
382                                         
383                                         if (qtrim == 1 && (origSeq.length() != currSeq.getUnaligned().length())) { 
384                                                 success = 0; //if you don't want to trim and the sequence does not meet quality requirements, move to scrap
385                                         }
386
387                                         if(!success)                    {       trashCode += 'q';                                                               }
388                                 }
389                         
390                                 if(barcodes.size() != 0){
391                                         success = stripBarcode(currSeq, group);
392                                         if(success > bdiffs){   trashCode += 'b';       }
393                                         else{ currentSeqsDiffs += success;  }
394                                 }
395
396                                 if(numFPrimers != 0){
397                                         success = stripForward(currSeq);
398                                         if(success > pdiffs){   trashCode += 'f';       }
399                                         else{ currentSeqsDiffs += success;  }
400                                 }
401                                 
402                                 if (currentSeqsDiffs > tdiffs) { trashCode += 't';   }
403
404                                 if(numRPrimers != 0){
405                                         success = stripReverse(currSeq);
406                                         if(!success){   trashCode += 'r';       }
407                                 }
408                 
409                                 if(minLength > 0 || maxLength > 0){
410                                         success = cullLength(currSeq);
411                                         if(!success){   trashCode += 'l'; }
412                                 }
413                                 if(maxHomoP > 0){
414                                         success = cullHomoP(currSeq);
415                                         if(!success){   trashCode += 'h';       }
416                                 }
417                                 if(maxAmbig != -1){
418                                         success = cullAmbigs(currSeq);
419                                         if(!success){   trashCode += 'n';       }
420                                 }
421                                 
422                                 if(flip){       currSeq.reverseComplement();    }               // should go last                       
423                                 
424                                 if(trashCode.length() == 0){
425                                         currSeq.setAligned(currSeq.getUnaligned());
426                                         currSeq.printSequence(outFASTA);
427                                         if(barcodes.size() != 0){
428                                                 outGroups << currSeq.getName() << '\t' << groupVector[group] << endl;
429                                                 
430                                                 if(allFiles){
431                                                         currSeq.printSequence(*fastaFileNames[group]);                                  
432                                                 }
433                                         }
434                                 }
435                                 else{
436                                         currSeq.setName(currSeq.getName() + '|' + trashCode);
437                                         currSeq.setUnaligned(origSeq);
438                                         currSeq.setAligned(origSeq);
439                                         currSeq.printSequence(scrapFASTA);
440                                 }
441                         }
442                         gobble(inFASTA);
443                 }
444                 
445                 inFASTA.close();
446                 outFASTA.close();
447                 scrapFASTA.close();
448                 if (oligoFile != "") {   outGroups.close();   }
449                 if(qFileName != "")     {       qFile.close();  }
450                 
451                 for(int i=0;i<fastaFileNames.size();i++){
452                         fastaFileNames[i]->close();
453                         delete fastaFileNames[i];
454                 }               
455                 
456                 return 0;
457         }
458         catch(exception& e) {
459                 m->errorOut(e, "TrimSeqsCommand", "driverCreateTrim");
460                 exit(1);
461         }
462 }
463 /**************************************************************************************************/
464 int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName, string trimFile, string scrapFile, string groupFile, vector<string> fastaNames) {
465         try {
466 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
467                 int process = 0;
468                 int exitCommand = 1;
469                 processIDS.clear();
470                 
471                 //loop through and create all the processes you want
472                 while (process != processors) {
473                         int pid = fork();
474                         
475                         if (pid > 0) {
476                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
477                                 process++;
478                         }else if (pid == 0){
479                                 driverCreateTrim(filename, qFileName, (trimFile + toString(getpid()) + ".temp"), (scrapFile + toString(getpid()) + ".temp"), (groupFile + toString(getpid()) + ".temp"), fastaNames, lines[process], qLines[process]);
480                                 exit(0);
481                         }else { m->mothurOut("unable to spawn the necessary processes."); m->mothurOutEndLine(); exit(0); }
482                 }
483                 
484                 //force parent to wait until all the processes are done
485                 for (int i=0;i<processors;i++) { 
486                         int temp = processIDS[i];
487                         wait(&temp);
488                 }
489                 
490                 return exitCommand;
491 #endif          
492         }
493         catch(exception& e) {
494                 m->errorOut(e, "TrimSeqsCommand", "createProcessesCreateTrim");
495                 exit(1);
496         }
497 }
498 /**************************************************************************************************/
499
500 int TrimSeqsCommand::setLines(string filename, vector<linePair*>& lines) {
501         try {
502                 
503                 lines.clear();
504                 
505                 vector<long int> positions;
506                 
507                 ifstream inFASTA;
508                 openInputFile(filename, inFASTA);
509                         
510                 string input;
511                 while(!inFASTA.eof()){  
512                         input = getline(inFASTA);
513
514                         if (input.length() != 0) {
515                                 if(input[0] == '>'){ long int pos = inFASTA.tellg(); positions.push_back(pos - input.length() - 1);     }
516                         }
517                 }
518                 inFASTA.close();
519                 
520                 int numFastaSeqs = positions.size();
521         
522                 FILE * pFile;
523                 long size;
524                 
525                 //get num bytes in file
526                 pFile = fopen (filename.c_str(),"rb");
527                 if (pFile==NULL) perror ("Error opening file");
528                 else{
529                         fseek (pFile, 0, SEEK_END);
530                         size=ftell (pFile);
531                         fclose (pFile);
532                 }
533                 
534                 int numSeqsPerProcessor = numFastaSeqs / processors;
535                 
536                 for (int i = 0; i < processors; i++) {
537
538                         long int startPos = positions[ i * numSeqsPerProcessor ];
539                         if(i == processors - 1){
540                                 numSeqsPerProcessor = numFastaSeqs - i * numSeqsPerProcessor;
541                         }else{  
542                                 long int myEnd = positions[ (i+1) * numSeqsPerProcessor ];
543                         }
544                         lines.push_back(new linePair(startPos, numSeqsPerProcessor));
545                 }
546                 
547                 return numFastaSeqs;
548         }
549         catch(exception& e) {
550                 m->errorOut(e, "TrimSeqsCommand", "setLines");
551                 exit(1);
552         }
553 }
554 //***************************************************************************************************************
555
556 void TrimSeqsCommand::getOligos(vector<string>& outFASTAVec){ //vector<ofstream*>& outFASTAVec
557         try {
558                 ifstream inOligos;
559                 openInputFile(oligoFile, inOligos);
560                 
561                 ofstream test;
562                 
563                 string type, oligo, group;
564                 int index=0;
565                 
566                 while(!inOligos.eof()){
567                         inOligos >> type;
568                         
569                         if(type[0] == '#'){
570                                 while (!inOligos.eof()) {       char c = inOligos.get(); if (c == 10 || c == 13){       break;  }       } // get rest of line if there's any crap there
571                         }
572                         else{
573                                 inOligos >> oligo;
574                                 
575                                 for(int i=0;i<oligo.length();i++){
576                                         oligo[i] = toupper(oligo[i]);
577                                         if(oligo[i] == 'U')     {       oligo[i] = 'T'; }
578                                 }
579                                 
580                                 if(type == "forward"){
581                                         forPrimer.push_back(oligo);
582                                 }
583                                 else if(type == "reverse"){
584                                         Sequence oligoRC("reverse", oligo);
585                                         oligoRC.reverseComplement();
586                                         revPrimer.push_back(oligoRC.getUnaligned());
587                                 }
588                                 else if(type == "barcode"){
589                                         inOligos >> group;
590                                         barcodes[oligo]=index++;
591                                         groupVector.push_back(group);
592                                         
593                                         if(allFiles){
594                                                 //outFASTAVec.push_back(new ofstream((outputDir + getRootName(getSimpleName(fastaFile)) + group + ".fasta").c_str(), ios::ate));
595                                                 outputNames.push_back((outputDir + getRootName(getSimpleName(fastaFile)) + group + ".fasta"));
596                                                 outFASTAVec.push_back((outputDir + getRootName(getSimpleName(fastaFile)) + group + ".fasta"));
597                                         }
598                                 }
599                         }
600                 }
601                 
602                 inOligos.close();
603                 
604                 numFPrimers = forPrimer.size();
605                 numRPrimers = revPrimer.size();
606                 
607         }
608         catch(exception& e) {
609                 m->errorOut(e, "TrimSeqsCommand", "getOligos");
610                 exit(1);
611         }
612 }
613 //***************************************************************************************************************
614
615 int TrimSeqsCommand::stripBarcode(Sequence& seq, int& group){
616         try {
617                 
618                 string rawSequence = seq.getUnaligned();
619                 int success = bdiffs + 1;       //guilty until proven innocent
620                 
621                 //can you find the barcode
622                 for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
623                         string oligo = it->first;
624                         if(rawSequence.length() < oligo.length()){      //let's just assume that the barcodes are the same length
625                                 success = bdiffs + 10;                                  //if the sequence is shorter than the barcode then bail out
626                                 break;  
627                         }
628                         
629                         if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
630                                 group = it->second;
631                                 seq.setUnaligned(rawSequence.substr(oligo.length()));
632                                 success = 0;
633                                 break;
634                         }
635                 }
636                 
637                 //if you found the barcode or if you don't want to allow for diffs
638 //              cout << success;
639                 if ((bdiffs == 0) || (success == 0)) { return success;  }
640                 
641                 else { //try aligning and see if you can find it
642 //                      cout << endl;
643
644                         int maxLength = 0;
645
646                         Alignment* alignment;
647                         if (barcodes.size() > 0) {
648                                 map<string,int>::iterator it=barcodes.begin();
649
650                                 for(it;it!=barcodes.end();it++){
651                                         if(it->first.length() > maxLength){
652                                                 maxLength = it->first.length();
653                                         }
654                                 }
655                                 alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+bdiffs+1));  
656
657                         }else{ alignment = NULL; } 
658                         
659                         //can you find the barcode
660                         int minDiff = 1e6;
661                         int minCount = 1;
662                         int minGroup = -1;
663                         int minPos = 0;
664                         
665                         for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
666                                 string oligo = it->first;
667 //                              int length = oligo.length();
668                                 
669                                 if(rawSequence.length() < maxLength){   //let's just assume that the barcodes are the same length
670                                         success = bdiffs + 10;
671                                         break;
672                                 }
673                                 
674                                 //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
675                                 alignment->align(oligo, rawSequence.substr(0,oligo.length()+bdiffs));
676                                 oligo = alignment->getSeqAAln();
677                                 string temp = alignment->getSeqBAln();
678                 
679                                 int alnLength = oligo.length();
680                                 
681                                 for(int i=oligo.length()-1;i>=0;i--){
682                                         if(oligo[i] != '-'){    alnLength = i+1;        break;  }
683                                 }
684                                 oligo = oligo.substr(0,alnLength);
685                                 temp = temp.substr(0,alnLength);
686                                 
687                                 int newStart=0;
688                                 int numDiff = countDiffs(oligo, temp);
689                                 
690 //                              cout << oligo << '\t' << temp << '\t' << numDiff << endl;                               
691                                 
692                                 if(numDiff < minDiff){
693                                         minDiff = numDiff;
694                                         minCount = 1;
695                                         minGroup = it->second;
696                                         minPos = 0;
697                                         for(int i=0;i<alnLength;i++){
698                                                 if(temp[i] != '-'){
699                                                         minPos++;
700                                                 }
701                                         }
702                                 }
703                                 else if(numDiff == minDiff){
704                                         minCount++;
705                                 }
706
707                         }
708
709                         if(minDiff > bdiffs)    {       success = minDiff;              }       //no good matches
710                         else if(minCount > 1)   {       success = bdiffs + 100; }       //can't tell the difference between multiple barcodes
711                         else{                                                                                                   //use the best match
712                                 group = minGroup;
713                                 seq.setUnaligned(rawSequence.substr(minPos));
714                                 success = minDiff;
715                         }
716                         
717                         if (alignment != NULL) {  delete alignment;  }
718                         
719                 }
720 //              cout << success << endl;
721                 
722                 return success;
723                 
724         }
725         catch(exception& e) {
726                 m->errorOut(e, "TrimSeqsCommand", "stripBarcode");
727                 exit(1);
728         }
729
730 }
731
732 //***************************************************************************************************************
733
734 int TrimSeqsCommand::stripForward(Sequence& seq){
735         try {
736                 string rawSequence = seq.getUnaligned();
737                 int success = pdiffs + 1;       //guilty until proven innocent
738                 
739                 //can you find the primer
740                 for(int i=0;i<numFPrimers;i++){
741                         string oligo = forPrimer[i];
742
743                         if(rawSequence.length() < oligo.length()){      
744                                 success = pdiffs + 1;
745                                 break;
746                         }
747                         
748                         if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
749                                 seq.setUnaligned(rawSequence.substr(oligo.length()));
750                                 success = 0;
751                                 break;
752                         }
753                 }
754                 
755                 //if you found the barcode or if you don't want to allow for diffs
756 //              cout << success;
757                 if ((pdiffs == 0) || (success == 0)) { return success;  }
758                 
759                 else { //try aligning and see if you can find it
760 //                      cout << endl;
761
762                         int maxLength = 0;
763
764                         Alignment* alignment;
765                         if (numFPrimers > 0) {
766
767                                 for(int i=0;i<numFPrimers;i++){
768                                         if(forPrimer[i].length() > maxLength){
769                                                 maxLength = forPrimer[i].length();
770                                         }
771                                 }
772                                 alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+pdiffs+1));  
773
774                         }else{ alignment = NULL; } 
775                         
776                         //can you find the barcode
777                         int minDiff = 1e6;
778                         int minCount = 1;
779                         int minPos = 0;
780                         
781                         for(int i=0;i<numFPrimers;i++){
782                                 string oligo = forPrimer[i];
783                                 
784                                 if(rawSequence.length() < maxLength){   
785                                         success = pdiffs + 100;
786                                         break;
787                                 }
788                                 
789                                 //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
790                                 alignment->align(oligo, rawSequence.substr(0,oligo.length()+pdiffs));
791                                 oligo = alignment->getSeqAAln();
792                                 string temp = alignment->getSeqBAln();
793                 
794                                 int alnLength = oligo.length();
795                                 
796                                 for(int i=oligo.length()-1;i>=0;i--){
797                                         if(oligo[i] != '-'){    alnLength = i+1;        break;  }
798                                 }
799                                 oligo = oligo.substr(0,alnLength);
800                                 temp = temp.substr(0,alnLength);
801
802                                 int newStart=0;
803                                 int numDiff = countDiffs(oligo, temp);
804                                 if(numDiff < minDiff){
805                                         minDiff = numDiff;
806                                         minCount = 1;
807                                         minPos = 0;
808                                         for(int i=0;i<alnLength;i++){
809                                                 if(temp[i] != '-'){
810                                                         minPos++;
811                                                 }
812                                         }
813                                 }
814                                 else if(numDiff == minDiff){
815                                         minCount++;
816                                 }
817
818                         }
819                         if(minDiff > pdiffs)    {       success =  minDiff;             }
820                         else if(minCount > 1)   {       success =  pdiffs + 10; }
821                         else{
822                                 seq.setUnaligned(rawSequence.substr(minPos));
823                                 success = minDiff;
824                         }
825                         
826                         if (alignment != NULL) {  delete alignment;  }
827                         
828                 }
829                 return success;
830
831         }
832         catch(exception& e) {
833                 m->errorOut(e, "TrimSeqsCommand", "stripForward");
834                 exit(1);
835         }
836 }
837
838 //***************************************************************************************************************
839
840 bool TrimSeqsCommand::stripReverse(Sequence& seq){
841         try {
842                 string rawSequence = seq.getUnaligned();
843                 bool success = 0;       //guilty until proven innocent
844                 
845                 for(int i=0;i<numRPrimers;i++){
846                         string oligo = revPrimer[i];
847                         
848                         if(rawSequence.length() < oligo.length()){
849                                 success = 0;
850                                 break;
851                         }
852                         
853                         if(compareDNASeq(oligo, rawSequence.substr(rawSequence.length()-oligo.length(),oligo.length()))){
854                                 seq.setUnaligned(rawSequence.substr(0,rawSequence.length()-oligo.length()));
855                                 success = 1;
856                                 break;
857                         }
858                 }       
859                 return success;
860                 
861         }
862         catch(exception& e) {
863                 m->errorOut(e, "TrimSeqsCommand", "stripReverse");
864                 exit(1);
865         }
866 }
867
868 //***************************************************************************************************************
869
870 bool TrimSeqsCommand::cullLength(Sequence& seq){
871         try {
872         
873                 int length = seq.getNumBases();
874                 bool success = 0;       //guilty until proven innocent
875                 
876                 if(length >= minLength && maxLength == 0)                       {       success = 1;    }
877                 else if(length >= minLength && length <= maxLength)     {       success = 1;    }
878                 else                                                                                            {       success = 0;    }
879                 
880                 return success;
881         
882         }
883         catch(exception& e) {
884                 m->errorOut(e, "TrimSeqsCommand", "cullLength");
885                 exit(1);
886         }
887         
888 }
889
890 //***************************************************************************************************************
891
892 bool TrimSeqsCommand::cullHomoP(Sequence& seq){
893         try {
894                 int longHomoP = seq.getLongHomoPolymer();
895                 bool success = 0;       //guilty until proven innocent
896                 
897                 if(longHomoP <= maxHomoP){      success = 1;    }
898                 else                                    {       success = 0;    }
899                 
900                 return success;
901         }
902         catch(exception& e) {
903                 m->errorOut(e, "TrimSeqsCommand", "cullHomoP");
904                 exit(1);
905         }
906         
907 }
908
909 //***************************************************************************************************************
910
911 bool TrimSeqsCommand::cullAmbigs(Sequence& seq){
912         try {
913                 int numNs = seq.getAmbigBases();
914                 bool success = 0;       //guilty until proven innocent
915                 
916                 if(numNs <= maxAmbig)   {       success = 1;    }
917                 else                                    {       success = 0;    }
918                 
919                 return success;
920         }
921         catch(exception& e) {
922                 m->errorOut(e, "TrimSeqsCommand", "cullAmbigs");
923                 exit(1);
924         }
925         
926 }
927
928 //***************************************************************************************************************
929
930 bool TrimSeqsCommand::compareDNASeq(string oligo, string seq){
931         try {
932                 bool success = 1;
933                 int length = oligo.length();
934                 
935                 for(int i=0;i<length;i++){
936                         
937                         if(oligo[i] != seq[i]){
938                                 if(oligo[i] == 'A' || oligo[i] == 'T' || oligo[i] == 'G' || oligo[i] == 'C')    {       success = 0;    }
939                                 else if((oligo[i] == 'N' || oligo[i] == 'I') && (seq[i] == 'N'))                                {       success = 0;    }
940                                 else if(oligo[i] == 'R' && (seq[i] != 'A' && seq[i] != 'G'))                                    {       success = 0;    }
941                                 else if(oligo[i] == 'Y' && (seq[i] != 'C' && seq[i] != 'T'))                                    {       success = 0;    }
942                                 else if(oligo[i] == 'M' && (seq[i] != 'C' && seq[i] != 'A'))                                    {       success = 0;    }
943                                 else if(oligo[i] == 'K' && (seq[i] != 'T' && seq[i] != 'G'))                                    {       success = 0;    }
944                                 else if(oligo[i] == 'W' && (seq[i] != 'T' && seq[i] != 'A'))                                    {       success = 0;    }
945                                 else if(oligo[i] == 'S' && (seq[i] != 'C' && seq[i] != 'G'))                                    {       success = 0;    }
946                                 else if(oligo[i] == 'B' && (seq[i] != 'C' && seq[i] != 'T' && seq[i] != 'G'))   {       success = 0;    }
947                                 else if(oligo[i] == 'D' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'G'))   {       success = 0;    }
948                                 else if(oligo[i] == 'H' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'C'))   {       success = 0;    }
949                                 else if(oligo[i] == 'V' && (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G'))   {       success = 0;    }                       
950                                 
951                                 if(success == 0)        {       break;   }
952                         }
953                         else{
954                                 success = 1;
955                         }
956                 }
957                 
958                 return success;
959         }
960         catch(exception& e) {
961                 m->errorOut(e, "TrimSeqsCommand", "compareDNASeq");
962                 exit(1);
963         }
964
965 }
966 //***************************************************************************************************************
967
968 int TrimSeqsCommand::countDiffs(string oligo, string seq){
969         try {
970
971                 int length = oligo.length();
972                 int countDiffs = 0;
973                 
974                 for(int i=0;i<length;i++){
975                                                                 
976                         if(oligo[i] != seq[i]){
977                                 if(oligo[i] == 'A' || oligo[i] == 'T' || oligo[i] == 'G' || oligo[i] == 'C' || oligo[i] == '-' || oligo[i] == '.')      {       countDiffs++;   }
978                                 else if((oligo[i] == 'N' || oligo[i] == 'I') && (seq[i] == 'N'))                                {       countDiffs++;   }
979                                 else if(oligo[i] == 'R' && (seq[i] != 'A' && seq[i] != 'G'))                                    {       countDiffs++;   }
980                                 else if(oligo[i] == 'Y' && (seq[i] != 'C' && seq[i] != 'T'))                                    {       countDiffs++;   }
981                                 else if(oligo[i] == 'M' && (seq[i] != 'C' && seq[i] != 'A'))                                    {       countDiffs++;   }
982                                 else if(oligo[i] == 'K' && (seq[i] != 'T' && seq[i] != 'G'))                                    {       countDiffs++;   }
983                                 else if(oligo[i] == 'W' && (seq[i] != 'T' && seq[i] != 'A'))                                    {       countDiffs++;   }
984                                 else if(oligo[i] == 'S' && (seq[i] != 'C' && seq[i] != 'G'))                                    {       countDiffs++;   }
985                                 else if(oligo[i] == 'B' && (seq[i] != 'C' && seq[i] != 'T' && seq[i] != 'G'))   {       countDiffs++;   }
986                                 else if(oligo[i] == 'D' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'G'))   {       countDiffs++;   }
987                                 else if(oligo[i] == 'H' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'C'))   {       countDiffs++;   }
988                                 else if(oligo[i] == 'V' && (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G'))   {       countDiffs++;   }                       
989                                 
990                         }
991                 }
992                 
993                 return countDiffs;
994         }
995         catch(exception& e) {
996                 m->errorOut(e, "TrimSeqsCommand", "countDiffs");
997                 exit(1);
998         }
999
1000 }
1001 //***************************************************************************************************************
1002
1003 bool TrimSeqsCommand::stripQualThreshold(Sequence& seq, ifstream& qFile){
1004         try {
1005 //              string rawSequence = seq.getUnaligned();
1006 //              int seqLength;  // = rawSequence.length();
1007 //              string name, temp, temp2;
1008 //              
1009 //              qFile >> name;
1010 //              
1011 //              //get rest of line
1012 //              temp = "";
1013 //              while (!qFile.eof())    {       
1014 //                      char c = qFile.get(); 
1015 //                      if (c == 10 || c == 13){        break;  }       
1016 //                      else { temp += c; }
1017 //              } 
1018 //      
1019 //              int pos = temp.find("length");
1020 //              if (pos == temp.npos) { m->mothurOut("Cannot find length in qfile for " + seq.getName()); m->mothurOutEndLine();  seqLength = 0;  }
1021 //              else {
1022 //                      string tempLength = temp.substr(pos);
1023 //                      istringstream iss (tempLength,istringstream::in);
1024 //                      iss >> temp;
1025 //              }
1026 //              
1027 //              splitAtEquals(temp2, temp); //separates length=242, temp=length, temp2=242
1028 //              convert(temp, seqLength); //converts string to int
1029 //      
1030 //              if (name.length() != 0) {  if(name.substr(1) != seq.getName())  {       m->mothurOut("sequence name mismatch btwn fasta and qual file"); m->mothurOutEndLine(); }  } 
1031                 
1032                 string rawSequence = seq.getUnaligned();
1033                 int seqLength = seq.getNumBases();
1034                 bool success = 0;       //guilty until proven innocent
1035                 string name;
1036                 
1037                 qFile >> name;
1038                 if (name[0] == '>') {  if(name.substr(1) != seq.getName())      {       m->mothurOut("sequence name mismatch btwn fasta: " + seq.getName() + " and qual file: " + name); m->mothurOutEndLine(); }  }
1039                 
1040                 while (!qFile.eof())    {       char c = qFile.get(); if (c == 10 || c == 13){  break;  }       }
1041                 
1042                 int score;
1043                 int end = seqLength;
1044                 
1045                 for(int i=0;i<seqLength;i++){
1046                         qFile >> score;
1047                         
1048                         if(score < qThreshold){
1049                                 end = i;
1050                                 break;
1051                         }
1052                 }
1053                 for(int i=end+1;i<seqLength;i++){
1054                         qFile >> score;
1055                 }
1056                 
1057                 seq.setUnaligned(rawSequence.substr(0,end));
1058                 
1059                 return 1;
1060         }
1061         catch(exception& e) {
1062                 m->errorOut(e, "TrimSeqsCommand", "stripQualThreshold");
1063                 exit(1);
1064         }
1065 }
1066
1067 //***************************************************************************************************************
1068
1069 bool TrimSeqsCommand::cullQualAverage(Sequence& seq, ifstream& qFile){
1070         try {
1071                 string rawSequence = seq.getUnaligned();
1072                 int seqLength = seq.getNumBases();
1073                 bool success = 0;       //guilty until proven innocent
1074                 string name;
1075                 
1076                 qFile >> name;
1077                 if (name[0] == '>') {  if(name.substr(1) != seq.getName())      {       m->mothurOut("sequence name mismatch btwn fasta: " + seq.getName() + " and qual file: " + name); m->mothurOutEndLine(); }  }
1078                 
1079                 while (!qFile.eof())    {       char c = qFile.get(); if (c == 10 || c == 13){  break;  }       }
1080                 
1081                 float score;    
1082                 float average = 0;
1083                 
1084                 for(int i=0;i<seqLength;i++){
1085                         qFile >> score;
1086                         average += score;
1087                 }
1088                 average /= seqLength;
1089
1090                 if(average >= qAverage) {       success = 1;    }
1091                 else                                    {       success = 0;    }
1092                 
1093                 return success;
1094         }
1095         catch(exception& e) {
1096                 m->errorOut(e, "TrimSeqsCommand", "cullQualAverage");
1097                 exit(1);
1098         }
1099 }
1100
1101 //***************************************************************************************************************