]> git.donarmstrong.com Git - mothur.git/blob - trimseqscommand.cpp
a544733375b004e651409ef47c9db62bb0f0756d
[mothur.git] / trimseqscommand.cpp
1 /*
2  *  trimseqscommand.cpp
3  *  Mothur
4  *
5  *  Created by Pat Schloss on 6/6/09.
6  *  Copyright 2009 Patrick D. Schloss. All rights reserved.
7  *
8  */
9
10 #include "trimseqscommand.h"
11 #include "needlemanoverlap.hpp"
12
13 //***************************************************************************************************************
14
15 TrimSeqsCommand::TrimSeqsCommand(string option)  {
16         try {
17                 
18                 abort = false;
19                 
20                 //allow user to run help
21                 if(option == "help") { help(); abort = true; }
22                 
23                 else {
24                         //valid paramters for this command
25                         string AlignArray[] =  {"fasta", "flip", "oligos", "maxambig", "maxhomop", "minlength", "maxlength", "qfile", 
26                                                                         "qthreshold", "qaverage", "allfiles", "qtrim","tdiffs", "pdiffs", "bdiffs", "processors", "outputdir","inputdir"};
27                         
28                         vector<string> myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string)));
29                         
30                         OptionParser parser(option);
31                         map<string,string> parameters = parser.getParameters();
32                         
33                         ValidParameters validParameter;
34                         map<string,string>::iterator it;
35                         
36                         //check to make sure all parameters are valid for command
37                         for (it = parameters.begin(); it != parameters.end(); it++) { 
38                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
39                         }
40                         
41                         //if the user changes the input directory command factory will send this info to us in the output parameter 
42                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
43                         if (inputDir == "not found"){   inputDir = "";          }
44                         else {
45                                 string path;
46                                 it = parameters.find("fasta");
47                                 //user has given a template file
48                                 if(it != parameters.end()){ 
49                                         path = hasPath(it->second);
50                                         //if the user has not given a path then, add inputdir. else leave path alone.
51                                         if (path == "") {       parameters["fasta"] = inputDir + it->second;            }
52                                 }
53                                 
54                                 it = parameters.find("oligos");
55                                 //user has given a template file
56                                 if(it != parameters.end()){ 
57                                         path = hasPath(it->second);
58                                         //if the user has not given a path then, add inputdir. else leave path alone.
59                                         if (path == "") {       parameters["oligos"] = inputDir + it->second;           }
60                                 }
61                                 
62                                 it = parameters.find("qfile");
63                                 //user has given a template file
64                                 if(it != parameters.end()){ 
65                                         path = hasPath(it->second);
66                                         //if the user has not given a path then, add inputdir. else leave path alone.
67                                         if (path == "") {       parameters["qfile"] = inputDir + it->second;            }
68                                 }
69                         }
70
71                         
72                         //check for required parameters
73                         fastaFile = validParameter.validFile(parameters, "fasta", true);
74                         if (fastaFile == "not found") { m->mothurOut("fasta is a required parameter for the screen.seqs command."); m->mothurOutEndLine(); abort = true; }
75                         else if (fastaFile == "not open") { abort = true; }     
76                         
77                         //if the user changes the output directory command factory will send this info to us in the output parameter 
78                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  
79                                 outputDir = ""; 
80                                 outputDir += hasPath(fastaFile); //if user entered a file with a path then preserve it  
81                         }
82                 
83                         //check for optional parameter and set defaults
84                         // ...at some point should added some additional type checking...
85                         string temp;
86                         temp = validParameter.validFile(parameters, "flip", false);
87                         if (temp == "not found"){       flip = 0;       }
88                         else if(isTrue(temp))   {       flip = 1;       }
89                 
90                         temp = validParameter.validFile(parameters, "oligos", true);
91                         if (temp == "not found"){       oligoFile = "";         }
92                         else if(temp == "not open"){    abort = true;   } 
93                         else                                    {       oligoFile = temp;               }
94                         
95                         temp = validParameter.validFile(parameters, "maxambig", false);         if (temp == "not found") { temp = "-1"; }
96                         convert(temp, maxAmbig);  
97
98                         temp = validParameter.validFile(parameters, "maxhomop", false);         if (temp == "not found") { temp = "0"; }
99                         convert(temp, maxHomoP);  
100
101                         temp = validParameter.validFile(parameters, "minlength", false);        if (temp == "not found") { temp = "0"; }
102                         convert(temp, minLength); 
103                         
104                         temp = validParameter.validFile(parameters, "maxlength", false);        if (temp == "not found") { temp = "0"; }
105                         convert(temp, maxLength);
106                         
107                         
108                         temp = validParameter.validFile(parameters, "bdiffs", false);           if (temp == "not found") { temp = "0"; }
109                         convert(temp, bdiffs);
110                         
111                         temp = validParameter.validFile(parameters, "pdiffs", false);           if (temp == "not found") { temp = "0"; }
112                         convert(temp, pdiffs);
113                         
114                         temp = validParameter.validFile(parameters, "tdiffs", false);           if (temp == "not found") { int tempTotal = pdiffs + bdiffs;  temp = toString(tempTotal); }
115                         convert(temp, tdiffs);
116                         
117                         if(tdiffs == 0){        tdiffs = bdiffs + pdiffs;       }
118                         
119                         temp = validParameter.validFile(parameters, "qfile", true);     
120                         if (temp == "not found")        {       qFileName = "";         }
121                         else if(temp == "not open")     {       abort = true;           }
122                         else                                            {       qFileName = temp;       }
123                         
124                         temp = validParameter.validFile(parameters, "qthreshold", false);       if (temp == "not found") { temp = "0"; }
125                         convert(temp, qThreshold);
126                         
127                         temp = validParameter.validFile(parameters, "qtrim", false);    if (temp == "not found") { temp = "F"; }
128                         qtrim = isTrue(temp);
129
130                         temp = validParameter.validFile(parameters, "qaverage", false);         if (temp == "not found") { temp = "0"; }
131                         convert(temp, qAverage);
132                         
133                         temp = validParameter.validFile(parameters, "allfiles", false);         if (temp == "not found") { temp = "F"; }
134                         allFiles = isTrue(temp);
135                         
136                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found"){       temp = "1";                             }
137                         convert(temp, processors); 
138                         
139                         if(allFiles && oligoFile == ""){
140                                 m->mothurOut("You selected allfiles, but didn't enter an oligos file.  Ignoring the allfiles request."); m->mothurOutEndLine();
141                         }
142                         if((qAverage != 0 && qThreshold != 0) && qFileName == ""){
143                                 m->mothurOut("You didn't provide a quality file name, quality criteria will be ignored."); m->mothurOutEndLine();
144                                 qAverage=0;
145                                 qThreshold=0;
146                         }
147                         if(!flip && oligoFile=="" && !maxLength && !minLength && (maxAmbig==-1) && !maxHomoP && qFileName == ""){               
148                                 m->mothurOut("You didn't set any options... quiting command."); m->mothurOutEndLine();
149                                 abort = true;
150                         }
151                 }
152
153         }
154         catch(exception& e) {
155                 m->errorOut(e, "TrimSeqsCommand", "TrimSeqsCommand");
156                 exit(1);
157         }
158 }
159 //**********************************************************************************************************************
160
161 void TrimSeqsCommand::help(){
162         try {
163                 m->mothurOut("The trim.seqs command reads a fastaFile and creates .....\n");
164                 m->mothurOut("The trim.seqs command parameters are fasta, flip, oligos, maxambig, maxhomop, minlength, maxlength, qfile, qthreshold, qaverage, diffs, qtrim and allfiles.\n");
165                 m->mothurOut("The fasta parameter is required.\n");
166                 m->mothurOut("The flip parameter .... The default is 0.\n");
167                 m->mothurOut("The oligos parameter .... The default is "".\n");
168                 m->mothurOut("The maxambig parameter .... The default is -1.\n");
169                 m->mothurOut("The maxhomop parameter .... The default is 0.\n");
170                 m->mothurOut("The minlength parameter .... The default is 0.\n");
171                 m->mothurOut("The maxlength parameter .... The default is 0.\n");
172                 m->mothurOut("The tdiffs parameter is used to specify the total number of differences allowed in the sequence. The default is pdiffs + bdiffs.\n");
173                 m->mothurOut("The bdiffs parameter is used to specify the number of differences allowed in the barcode. The default is 0.\n");
174                 m->mothurOut("The pdiffs parameter is used to specify the number of differences allowed in the primer. The default is 0.\n");
175                 m->mothurOut("The qfile parameter .....\n");
176                 m->mothurOut("The qthreshold parameter .... The default is 0.\n");
177                 m->mothurOut("The qaverage parameter .... The default is 0.\n");
178                 m->mothurOut("The allfiles parameter .... The default is F.\n");
179                 m->mothurOut("The qtrim parameter .... The default is F.\n");
180                 m->mothurOut("The trim.seqs command should be in the following format: \n");
181                 m->mothurOut("trim.seqs(fasta=yourFastaFile, flip=yourFlip, oligos=yourOligos, maxambig=yourMaxambig,  \n");
182                 m->mothurOut("maxhomop=yourMaxhomop, minlength=youMinlength, maxlength=yourMaxlength)  \n");    
183                 m->mothurOut("Example trim.seqs(fasta=abrecovery.fasta, flip=..., oligos=..., maxambig=..., maxhomop=..., minlength=..., maxlength=...).\n");
184                 m->mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n");
185                 m->mothurOut("For more details please check out the wiki http://www.mothur.org/wiki/Trim.seqs .\n\n");
186
187         }
188         catch(exception& e) {
189                 m->errorOut(e, "TrimSeqsCommand", "help");
190                 exit(1);
191         }
192 }
193
194
195 //***************************************************************************************************************
196
197 TrimSeqsCommand::~TrimSeqsCommand(){    /*      do nothing      */      }
198
199 //***************************************************************************************************************
200
201 int TrimSeqsCommand::execute(){
202         try{
203         
204                 if (abort == true) { return 0; }
205                 
206                 numFPrimers = 0;  //this needs to be initialized
207                 numRPrimers = 0;
208                 
209                 string trimSeqFile = outputDir + getRootName(getSimpleName(fastaFile)) + "trim.fasta";
210                 outputNames.push_back(trimSeqFile);
211                 string scrapSeqFile = outputDir + getRootName(getSimpleName(fastaFile)) + "scrap.fasta";
212                 outputNames.push_back(scrapSeqFile);
213                 string groupFile = outputDir + getRootName(getSimpleName(fastaFile)) + "groups";
214                 
215                 vector<string> fastaFileNames;
216                 if(oligoFile != ""){
217                         outputNames.push_back(groupFile);
218                         getOligos(fastaFileNames);
219                 }
220                 
221                 if(qFileName != "")     {       setLines(qFileName, qLines);    }
222
223                 
224                 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
225                                 if(processors == 1){
226                                         ifstream inFASTA;
227                                         openInputFile(fastaFile, inFASTA);
228                                         int numSeqs=count(istreambuf_iterator<char>(inFASTA),istreambuf_iterator<char>(), '>');
229                                         inFASTA.close();
230                                         
231                                         lines.push_back(new linePair(0, numSeqs));
232                                         
233                                         driverCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, groupFile, fastaFileNames, lines[0], lines[0]);
234                                         
235                                         for (int j = 0; j < fastaFileNames.size(); j++) {
236                                                 rename((fastaFileNames[j] + toString(getpid()) + ".temp").c_str(), fastaFileNames[j].c_str());
237                                         }
238
239                                 }else{
240                                         setLines(fastaFile, lines);     
241                                         if(qFileName == "")     {       qLines = lines; }       
242                                                                 
243                                         createProcessesCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, groupFile, fastaFileNames); 
244                                         
245                                         rename((trimSeqFile + toString(processIDS[0]) + ".temp").c_str(), trimSeqFile.c_str());
246                                         rename((scrapSeqFile + toString(processIDS[0]) + ".temp").c_str(), scrapSeqFile.c_str());
247                                         rename((groupFile + toString(processIDS[0]) + ".temp").c_str(), groupFile.c_str());
248                                         for (int j = 0; j < fastaFileNames.size(); j++) {
249                                                 rename((fastaFileNames[j] + toString(processIDS[0]) + ".temp").c_str(), fastaFileNames[j].c_str());
250                                         }
251                                         //append files
252                                         for(int i=1;i<processors;i++){
253                                                 appendFiles((trimSeqFile + toString(processIDS[i]) + ".temp"), trimSeqFile);
254                                                 remove((trimSeqFile + toString(processIDS[i]) + ".temp").c_str());
255                                                 appendFiles((scrapSeqFile + toString(processIDS[i]) + ".temp"), scrapSeqFile);
256                                                 remove((scrapSeqFile + toString(processIDS[i]) + ".temp").c_str());
257                                                 appendFiles((groupFile + toString(processIDS[i]) + ".temp"), groupFile);
258                                                 remove((groupFile + toString(processIDS[i]) + ".temp").c_str());
259                                                 for (int j = 0; j < fastaFileNames.size(); j++) {
260                                                         appendFiles((fastaFileNames[j] + toString(processIDS[i]) + ".temp"), fastaFileNames[j]);
261                                                         remove((fastaFileNames[j] + toString(processIDS[i]) + ".temp").c_str());
262                                                 }
263                                         }
264                                 }
265                                 
266                                 if (m->control_pressed) {  return 0; }
267                 #else
268                                 ifstream inFASTA;
269                                 openInputFile(fastafileNames[s], inFASTA);
270                                 numSeqs=count(istreambuf_iterator<char>(inFASTA),istreambuf_iterator<char>(), '>');
271                                 inFASTA.close();
272                                 
273                                 lines.push_back(new linePair(0, numSeqs));
274                                 
275                                 driverCreateSummary(fastafile, qFileName, trimSeqFile, scrapSeqFile, groupFile, fastaFileNames, lines[0], lines[0]);
276                                 
277                                 if (m->control_pressed) {  return 0; }
278                 #endif
279                                                 
280                                                                                 
281                 for(int i=0;i<fastaFileNames.size();i++){
282                         ifstream inFASTA;
283                         string seqName;
284                         openInputFile(getRootName(fastaFile) + groupVector[i] + ".fasta", inFASTA);
285                         ofstream outGroups;
286                         openOutputFile(outputDir + getRootName(getSimpleName(fastaFile)) + groupVector[i] + ".groups", outGroups);
287                         outputNames.push_back(outputDir + getRootName(getSimpleName(fastaFile)) + groupVector[i] + ".groups");
288                         
289                         while(!inFASTA.eof()){
290                                 if(inFASTA.get() == '>'){
291                                         inFASTA >> seqName;
292                                         outGroups << seqName << '\t' << groupVector[i] << endl;
293                                 }
294                                 while (!inFASTA.eof())  {       char c = inFASTA.get(); if (c == 10 || c == 13){        break;  }       }
295                         }
296                         outGroups.close();
297                         inFASTA.close();
298                 }
299                 
300                 if (m->control_pressed) { 
301                         for (int i = 0; i < outputNames.size(); i++) {  remove(outputNames[i].c_str()); }
302                         return 0;
303                 }
304
305                 m->mothurOutEndLine();
306                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
307                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
308                 m->mothurOutEndLine();
309                 
310                 return 0;       
311                         
312         }
313         catch(exception& e) {
314                 m->errorOut(e, "TrimSeqsCommand", "execute");
315                 exit(1);
316         }
317 }
318                 
319 /**************************************************************************************/
320 int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string trimFile, string scrapFile, string groupFile, vector<string> fastaNames, linePair* line, linePair* qline) {     
321         try {
322                 
323                 ofstream outFASTA;
324                 int able = openOutputFile(trimFile, outFASTA);
325                 
326                 ofstream scrapFASTA;
327                 openOutputFile(scrapFile, scrapFASTA);
328                 
329                 ofstream outGroups;
330                 vector<ofstream*> fastaFileNames;
331                 if (oligoFile != "") {          
332                         openOutputFile(groupFile, outGroups);   
333                         for (int i = 0; i < fastaNames.size(); i++) {
334                                 fastaFileNames.push_back(new ofstream((fastaNames[i] + toString(getpid()) + ".temp").c_str(), ios::ate)); 
335                         }
336                 }
337                 
338                 ifstream inFASTA;
339                 openInputFile(filename, inFASTA);
340                 
341                 ifstream qFile;
342                 if(qFileName != "")     {       openInputFile(qFileName, qFile);        }
343                 
344                 qFile.seekg(qline->start);
345                 inFASTA.seekg(line->start);
346                 
347                 for(int i=0;i<line->num;i++){
348                                 
349                         if (m->control_pressed) { 
350                                 inFASTA.close(); 
351                                 outFASTA.close();
352                                 scrapFASTA.close();
353                                 if (oligoFile != "") {   outGroups.close();   }
354                                 if(qFileName != "")     {       qFile.close();  }
355                                 for(int i=0;i<fastaFileNames.size();i++){
356                                         fastaFileNames[i]->close();
357                                         delete fastaFileNames[i];
358                                 }       
359                                 for (int i = 0; i < outputNames.size(); i++) {  remove(outputNames[i].c_str()); }
360                                 return 0;
361                         }
362                         
363                         bool success = 1;
364                         
365                         Sequence currSeq(inFASTA);
366
367                         string origSeq = currSeq.getUnaligned();
368                         if (origSeq != "") {
369                                 int group;
370                                 string trashCode = "";
371                                 int currentSeqsDiffs = 0;
372                                 
373                                 if(qFileName != ""){
374                                         if(qThreshold != 0)             {       success = stripQualThreshold(currSeq, qFile);   }
375                                         else if(qAverage != 0)  {       success = cullQualAverage(currSeq, qFile);              }
376                                         if ((!qtrim) && (origSeq.length() != currSeq.getUnaligned().length())) { 
377                                                 success = 0; //if you don't want to trim and the sequence does not meet quality requirements, move to scrap
378                                         }
379                                         if(!success)                    {       trashCode += 'q';                                                               }
380                                 }
381                         
382                                 if(barcodes.size() != 0){
383                                         success = stripBarcode(currSeq, group);
384 //                                      cout << "here: " << success << endl;
385                                         if(success > bdiffs){   trashCode += 'b';       }
386                                         else{ currentSeqsDiffs += success;  }
387                                 }
388
389                                 if(numFPrimers != 0){
390                                         success = stripForward(currSeq);
391                                         if(success > pdiffs){   trashCode += 'f';       }
392                                         else{ currentSeqsDiffs += success;  }
393                                 }
394                                 
395                                 if (currentSeqsDiffs > tdiffs) { trashCode += 't';   }
396
397                                 if(numRPrimers != 0){
398                                         success = stripReverse(currSeq);
399                                         if(!success){   trashCode += 'r';       }
400                                 }
401                 
402                                 if(minLength > 0 || maxLength > 0){
403                                         success = cullLength(currSeq);
404                                         if(!success){   trashCode += 'l'; }
405                                 }
406                                 if(maxHomoP > 0){
407                                         success = cullHomoP(currSeq);
408                                         if(!success){   trashCode += 'h';       }
409                                 }
410                                 if(maxAmbig != -1){
411                                         success = cullAmbigs(currSeq);
412                                         if(!success){   trashCode += 'n';       }
413                                 }
414                                 
415                                 if(flip){       currSeq.reverseComplement();    }               // should go last                       
416                                 
417                                 if(trashCode.length() == 0){
418                                         currSeq.setAligned(currSeq.getUnaligned());  //this is because of a modification we made to the sequence class to fix a bug.  all seqs have an aligned version, which is the version that gets printed.
419                                         currSeq.printSequence(outFASTA);
420                                         if(barcodes.size() != 0){
421                                                 outGroups << currSeq.getName() << '\t' << groupVector[group] << endl;
422                                                 
423                                                 if(allFiles){
424                                                         currSeq.printSequence(*fastaFileNames[group]);                                  
425                                                 }
426                                         }
427                                 }
428                                 else{
429                                         currSeq.setName(currSeq.getName() + '|' + trashCode);
430                                         currSeq.setUnaligned(origSeq);
431                                         currSeq.printSequence(scrapFASTA);
432                                 }
433                         }
434                         gobble(inFASTA);
435                 }
436                 
437                 inFASTA.close();
438                 outFASTA.close();
439                 scrapFASTA.close();
440                 if (oligoFile != "") {   outGroups.close();   }
441                 if(qFileName != "")     {       qFile.close();  }
442                 
443                 for(int i=0;i<fastaFileNames.size();i++){
444                         fastaFileNames[i]->close();
445                         delete fastaFileNames[i];
446                 }               
447                 
448                 return 0;
449         }
450         catch(exception& e) {
451                 m->errorOut(e, "TrimSeqsCommand", "driverCreateTrim");
452                 exit(1);
453         }
454 }
455 /**************************************************************************************************/
456 int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName, string trimFile, string scrapFile, string groupFile, vector<string> fastaNames) {
457         try {
458 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
459                 int process = 0;
460                 int exitCommand = 1;
461                 processIDS.clear();
462                 
463                 //loop through and create all the processes you want
464                 while (process != processors) {
465                         int pid = fork();
466                         
467                         if (pid > 0) {
468                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
469                                 process++;
470                         }else if (pid == 0){
471                                 driverCreateTrim(filename, qFileName, (trimFile + toString(getpid()) + ".temp"), (scrapFile + toString(getpid()) + ".temp"), (groupFile + toString(getpid()) + ".temp"), fastaNames, lines[process], qLines[process]);
472                                 exit(0);
473                         }else { m->mothurOut("unable to spawn the necessary processes."); m->mothurOutEndLine(); exit(0); }
474                 }
475                 
476                 //force parent to wait until all the processes are done
477                 for (int i=0;i<processors;i++) { 
478                         int temp = processIDS[i];
479                         wait(&temp);
480                 }
481                 
482                 return exitCommand;
483 #endif          
484         }
485         catch(exception& e) {
486                 m->errorOut(e, "TrimSeqsCommand", "createProcessesCreateTrim");
487                 exit(1);
488         }
489 }
490 /**************************************************************************************************/
491
492 int TrimSeqsCommand::setLines(string filename, vector<linePair*>& lines) {
493         try {
494                 
495                 lines.clear();
496                 
497                 vector<long int> positions;
498                 
499                 ifstream inFASTA;
500                 openInputFile(filename, inFASTA);
501                         
502                 string input;
503                 while(!inFASTA.eof()){  
504                         input = getline(inFASTA);
505
506                         if (input.length() != 0) {
507                                 if(input[0] == '>'){ long int pos = inFASTA.tellg(); positions.push_back(pos - input.length() - 1);     }
508                         }
509                 }
510                 inFASTA.close();
511                 
512                 int numFastaSeqs = positions.size();
513         
514                 FILE * pFile;
515                 long size;
516                 
517                 //get num bytes in file
518                 pFile = fopen (filename.c_str(),"rb");
519                 if (pFile==NULL) perror ("Error opening file");
520                 else{
521                         fseek (pFile, 0, SEEK_END);
522                         size=ftell (pFile);
523                         fclose (pFile);
524                 }
525                 
526                 int numSeqsPerProcessor = numFastaSeqs / processors;
527                 
528                 for (int i = 0; i < processors; i++) {
529
530                         long int startPos = positions[ i * numSeqsPerProcessor ];
531                         if(i == processors - 1){
532                                 numSeqsPerProcessor = numFastaSeqs - i * numSeqsPerProcessor;
533                         }else{  
534                                 long int myEnd = positions[ (i+1) * numSeqsPerProcessor ];
535                         }
536                         lines.push_back(new linePair(startPos, numSeqsPerProcessor));
537                 }
538                 
539                 return numFastaSeqs;
540         }
541         catch(exception& e) {
542                 m->errorOut(e, "TrimSeqsCommand", "setLines");
543                 exit(1);
544         }
545 }
546 //***************************************************************************************************************
547
548 void TrimSeqsCommand::getOligos(vector<string>& outFASTAVec){ //vector<ofstream*>& outFASTAVec
549         try {
550                 ifstream inOligos;
551                 openInputFile(oligoFile, inOligos);
552                 
553                 ofstream test;
554                 
555                 string type, oligo, group;
556                 int index=0;
557                 
558                 while(!inOligos.eof()){
559                         inOligos >> type;
560                         
561                         if(type[0] == '#'){
562                                 while (!inOligos.eof()) {       char c = inOligos.get(); if (c == 10 || c == 13){       break;  }       } // get rest of line if there's any crap there
563                         }
564                         else{
565                                 inOligos >> oligo;
566                                 
567                                 for(int i=0;i<oligo.length();i++){
568                                         oligo[i] = toupper(oligo[i]);
569                                         if(oligo[i] == 'U')     {       oligo[i] = 'T'; }
570                                 }
571                                 
572                                 if(type == "forward"){
573                                         forPrimer.push_back(oligo);
574                                 }
575                                 else if(type == "reverse"){
576                                         Sequence oligoRC("reverse", oligo);
577                                         oligoRC.reverseComplement();
578                                         revPrimer.push_back(oligoRC.getUnaligned());
579                                 }
580                                 else if(type == "barcode"){
581                                         inOligos >> group;
582                                         barcodes[oligo]=index++;
583                                         groupVector.push_back(group);
584                                         
585                                         if(allFiles){
586                                                 //outFASTAVec.push_back(new ofstream((outputDir + getRootName(getSimpleName(fastaFile)) + group + ".fasta").c_str(), ios::ate));
587                                                 outputNames.push_back((outputDir + getRootName(getSimpleName(fastaFile)) + group + ".fasta"));
588                                                 outFASTAVec.push_back((outputDir + getRootName(getSimpleName(fastaFile)) + group + ".fasta"));
589                                         }
590                                 }
591                         }
592                 }
593                 
594                 inOligos.close();
595                 
596                 numFPrimers = forPrimer.size();
597                 numRPrimers = revPrimer.size();
598                 
599         }
600         catch(exception& e) {
601                 m->errorOut(e, "TrimSeqsCommand", "getOligos");
602                 exit(1);
603         }
604 }
605 //***************************************************************************************************************
606
607 int TrimSeqsCommand::stripBarcode(Sequence& seq, int& group){
608         try {
609                 string rawSequence = seq.getUnaligned();
610                 bool success = bdiffs + 1;      //guilty until proven innocent
611                 
612                 //can you find the barcode
613                 for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
614                         string oligo = it->first;
615                         if(rawSequence.length() < oligo.length()){      //let's just assume that the barcodes are the same length
616                                 success = bdiffs + 1;
617                                 break;
618                         }
619                         
620                         if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
621                                 group = it->second;
622                                 seq.setUnaligned(rawSequence.substr(oligo.length()));
623                                 success = 0;
624                                 break;
625                         }
626                 }
627                 
628                 //if you found the barcode or if you don't want to allow for diffs
629 //              cout << success;
630                 if ((bdiffs == 0) || (success == 0)) { return success;  }
631                 
632                 else { //try aligning and see if you can find it
633 //                      cout << endl;
634
635                         int maxLength = 0;
636
637                         Alignment* alignment;
638                         if (barcodes.size() > 0) {
639                                 map<string,int>::iterator it=barcodes.begin();
640
641                                 for(it;it!=barcodes.end();it++){
642                                         if(it->first.length() > maxLength){
643                                                 maxLength = it->first.length();
644                                         }
645                                 }
646                                 alignment = new NeedlemanOverlap(-2.0, 1.0, -1.0, (maxLength+bdiffs+1));  
647
648                         }else{ alignment = NULL; } 
649                         
650                         //can you find the barcode
651                         int minDiff = 1e6;
652                         int minCount = 1;
653                         int minGroup = -1;
654                         int minPos = 0;
655                         
656                         for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
657                                 string oligo = it->first;
658 //                              int length = oligo.length();
659                                 
660                                 if(rawSequence.length() < maxLength){   //let's just assume that the barcodes are the same length
661                                         success = bdiffs + 1;
662                                         break;
663                                 }
664                                 
665                                 //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
666                                 alignment->align(oligo, rawSequence.substr(0,oligo.length()+bdiffs));
667                                 oligo = alignment->getSeqAAln();
668                                 string temp = alignment->getSeqBAln();
669                 
670                                 int alnLength = oligo.length();
671                                 
672                                 for(int i=oligo.length()-1;i>=0;i--){
673                                         if(oligo[i] != '-'){    alnLength = i+1;        break;  }
674                                 }
675                                 oligo = oligo.substr(0,alnLength);
676                                 temp = temp.substr(0,alnLength);
677 //                              cout << "barcode = " << oligo << " raw = " << rawSequence.substr(0,alnLength) << " raw aligned = " << temp << endl;                     
678                                 cout << seq.getName() << endl;
679                                 cout << temp << endl;
680                                 cout << oligo << endl;
681                                 cout << alnLength << endl;
682                                 cout << endl;
683                                 
684                                 int newStart=0;
685                                 int numDiff = countDiffs(oligo, temp);
686                                 if(numDiff < minDiff){
687                                         minDiff = numDiff;
688                                         minCount = 1;
689                                         minGroup = it->second;
690                                         minPos = 0;
691                                         for(int i=0;i<alnLength;i++){
692                                                 if(temp[i] != '-'){
693                                                         minPos++;
694                                                 }
695                                         }
696                                 }
697                                 else if(numDiff == minDiff){
698                                         minCount++;
699                                 }
700
701                         }
702                         if(minDiff > bdiffs){   success =  bdiffs + 1;  }
703                         else if(minCount > 1)   {       success =  bdiffs + 1;  }
704                         else{
705                                 group = minGroup;
706                                 seq.setUnaligned("*" + rawSequence.substr(minPos));
707                                 success = minDiff;
708                         }
709                         
710                         if (alignment != NULL) {  delete alignment;  }
711                         
712                 }
713                 return success;
714                 
715         }
716         catch(exception& e) {
717                 m->errorOut(e, "TrimSeqsCommand", "stripBarcode");
718                 exit(1);
719         }
720
721 }
722
723 //***************************************************************************************************************
724
725 int TrimSeqsCommand::stripForward(Sequence& seq){
726         try {
727                 string rawSequence = seq.getUnaligned();
728                 bool success = pdiffs + 1;      //guilty until proven innocent
729                 
730                 //can you find the primer
731                 for(int i=0;i<numFPrimers;i++){
732                         string oligo = forPrimer[i];
733
734                         if(rawSequence.length() < oligo.length()){      
735                                 success = pdiffs + 1;
736                                 break;
737                         }
738                         
739                         if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
740                                 seq.setUnaligned(rawSequence.substr(oligo.length()));
741                                 success = 0;
742                                 break;
743                         }
744                 }
745                 
746                 //if you found the barcode or if you don't want to allow for diffs
747 //              cout << success;
748                 if ((pdiffs == 0) || (success == 0)) { return success;  }
749                 
750                 else { //try aligning and see if you can find it
751 //                      cout << endl;
752
753                         int maxLength = 0;
754
755                         Alignment* alignment;
756                         if (numFPrimers > 0) {
757
758                                 for(int i=0;i<numFPrimers;i++){
759                                         if(forPrimer[i].length() > maxLength){
760                                                 maxLength = forPrimer[i].length();
761                                         }
762                                 }
763                                 alignment = new NeedlemanOverlap(-2.0, 1.0, -1.0, (maxLength+pdiffs+1));  
764
765                         }else{ alignment = NULL; } 
766                         
767                         //can you find the barcode
768                         int minDiff = 1e6;
769                         int minCount = 1;
770                         int minPos = 0;
771                         
772                         for(int i=0;i<numFPrimers;i++){
773                                 string oligo = forPrimer[i];
774                                 
775                                 if(rawSequence.length() < maxLength){   
776                                         success = pdiffs + 1;
777                                         break;
778                                 }
779                                 
780                                 //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
781                                 alignment->align(oligo, rawSequence.substr(0,oligo.length()+pdiffs));
782                                 oligo = alignment->getSeqAAln();
783                                 string temp = alignment->getSeqBAln();
784                 
785                                 int alnLength = oligo.length();
786                                 
787                                 for(int i=oligo.length()-1;i>=0;i--){
788                                         if(oligo[i] != '-'){    alnLength = i+1;        break;  }
789                                 }
790                                 oligo = oligo.substr(0,alnLength);
791                                 temp = temp.substr(0,alnLength);
792 //                              cout << "barcode = " << oligo << " raw = " << rawSequence.substr(0,alnLength) << " raw aligned = " << temp << endl;                     
793                                 cout << seq.getName() << endl;
794                                 cout << temp << endl;
795                                 cout << oligo << endl;
796                                 cout << alnLength << endl;
797                                 cout << endl;
798                                 
799                                 int newStart=0;
800                                 int numDiff = countDiffs(oligo, temp);
801                                 if(numDiff < minDiff){
802                                         minDiff = numDiff;
803                                         minCount = 1;
804                                         minPos = 0;
805                                         for(int i=0;i<alnLength;i++){
806                                                 if(temp[i] != '-'){
807                                                         minPos++;
808                                                 }
809                                         }
810                                 }
811                                 else if(numDiff == minDiff){
812                                         minCount++;
813                                 }
814
815                         }
816                         if(minDiff > pdiffs){   success =  pdiffs + 1;  }
817                         else if(minCount > 1)   {       success =  pdiffs + 1;  }
818                         else{
819                                 seq.setUnaligned("*" + rawSequence.substr(minPos));
820                                 success = minDiff;
821                         }
822                         
823                         if (alignment != NULL) {  delete alignment;  }
824                         
825                 }
826                 return success;
827
828         }
829         catch(exception& e) {
830                 m->errorOut(e, "TrimSeqsCommand", "stripForward");
831                 exit(1);
832         }
833 }
834
835 //***************************************************************************************************************
836
837 bool TrimSeqsCommand::stripReverse(Sequence& seq){
838         try {
839                 string rawSequence = seq.getUnaligned();
840                 bool success = 0;       //guilty until proven innocent
841                 
842                 for(int i=0;i<numRPrimers;i++){
843                         string oligo = revPrimer[i];
844                         
845                         if(rawSequence.length() < oligo.length()){
846                                 success = 0;
847                                 break;
848                         }
849                         
850                         if(compareDNASeq(oligo, rawSequence.substr(rawSequence.length()-oligo.length(),oligo.length()))){
851                                 seq.setUnaligned(rawSequence.substr(0,rawSequence.length()-oligo.length()));
852                                 success = 1;
853                                 break;
854                         }
855                 }       
856                 return success;
857                 
858         }
859         catch(exception& e) {
860                 m->errorOut(e, "TrimSeqsCommand", "stripReverse");
861                 exit(1);
862         }
863 }
864
865 //***************************************************************************************************************
866
867 bool TrimSeqsCommand::cullLength(Sequence& seq){
868         try {
869         
870                 int length = seq.getNumBases();
871                 bool success = 0;       //guilty until proven innocent
872                 
873                 if(length >= minLength && maxLength == 0)                       {       success = 1;    }
874                 else if(length >= minLength && length <= maxLength)     {       success = 1;    }
875                 else                                                                                            {       success = 0;    }
876                 
877                 return success;
878         
879         }
880         catch(exception& e) {
881                 m->errorOut(e, "TrimSeqsCommand", "cullLength");
882                 exit(1);
883         }
884         
885 }
886
887 //***************************************************************************************************************
888
889 bool TrimSeqsCommand::cullHomoP(Sequence& seq){
890         try {
891                 int longHomoP = seq.getLongHomoPolymer();
892                 bool success = 0;       //guilty until proven innocent
893                 
894                 if(longHomoP <= maxHomoP){      success = 1;    }
895                 else                                    {       success = 0;    }
896                 
897                 return success;
898         }
899         catch(exception& e) {
900                 m->errorOut(e, "TrimSeqsCommand", "cullHomoP");
901                 exit(1);
902         }
903         
904 }
905
906 //***************************************************************************************************************
907
908 bool TrimSeqsCommand::cullAmbigs(Sequence& seq){
909         try {
910                 int numNs = seq.getAmbigBases();
911                 bool success = 0;       //guilty until proven innocent
912                 
913                 if(numNs <= maxAmbig)   {       success = 1;    }
914                 else                                    {       success = 0;    }
915                 
916                 return success;
917         }
918         catch(exception& e) {
919                 m->errorOut(e, "TrimSeqsCommand", "cullAmbigs");
920                 exit(1);
921         }
922         
923 }
924
925 //***************************************************************************************************************
926
927 bool TrimSeqsCommand::compareDNASeq(string oligo, string seq){
928         try {
929                 bool success = 1;
930                 int length = oligo.length();
931                 
932                 for(int i=0;i<length;i++){
933                         
934                         if(oligo[i] != seq[i]){
935                                 if(oligo[i] == 'A' || oligo[i] == 'T' || oligo[i] == 'G' || oligo[i] == 'C')    {       success = 0;    }
936                                 else if((oligo[i] == 'N' || oligo[i] == 'I') && (seq[i] == 'N'))                                {       success = 0;    }
937                                 else if(oligo[i] == 'R' && (seq[i] != 'A' && seq[i] != 'G'))                                    {       success = 0;    }
938                                 else if(oligo[i] == 'Y' && (seq[i] != 'C' && seq[i] != 'T'))                                    {       success = 0;    }
939                                 else if(oligo[i] == 'M' && (seq[i] != 'C' && seq[i] != 'A'))                                    {       success = 0;    }
940                                 else if(oligo[i] == 'K' && (seq[i] != 'T' && seq[i] != 'G'))                                    {       success = 0;    }
941                                 else if(oligo[i] == 'W' && (seq[i] != 'T' && seq[i] != 'A'))                                    {       success = 0;    }
942                                 else if(oligo[i] == 'S' && (seq[i] != 'C' && seq[i] != 'G'))                                    {       success = 0;    }
943                                 else if(oligo[i] == 'B' && (seq[i] != 'C' && seq[i] != 'T' && seq[i] != 'G'))   {       success = 0;    }
944                                 else if(oligo[i] == 'D' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'G'))   {       success = 0;    }
945                                 else if(oligo[i] == 'H' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'C'))   {       success = 0;    }
946                                 else if(oligo[i] == 'V' && (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G'))   {       success = 0;    }                       
947                                 
948                                 if(success == 0)        {       break;   }
949                         }
950                         else{
951                                 success = 1;
952                         }
953                 }
954                 
955                 return success;
956         }
957         catch(exception& e) {
958                 m->errorOut(e, "TrimSeqsCommand", "compareDNASeq");
959                 exit(1);
960         }
961
962 }
963 //***************************************************************************************************************
964
965 int TrimSeqsCommand::countDiffs(string oligo, string seq){
966         try {
967
968                 int length = oligo.length();
969                 int countDiffs = 0;
970                 
971                 for(int i=0;i<length;i++){
972                                                                 
973                         if(oligo[i] != seq[i]){
974                                 if(oligo[i] == 'A' || oligo[i] == 'T' || oligo[i] == 'G' || oligo[i] == 'C' || oligo[i] == '-' || oligo[i] == '.')      {       countDiffs++;   }
975                                 else if((oligo[i] == 'N' || oligo[i] == 'I') && (seq[i] == 'N'))                                {       countDiffs++;   }
976                                 else if(oligo[i] == 'R' && (seq[i] != 'A' && seq[i] != 'G'))                                    {       countDiffs++;   }
977                                 else if(oligo[i] == 'Y' && (seq[i] != 'C' && seq[i] != 'T'))                                    {       countDiffs++;   }
978                                 else if(oligo[i] == 'M' && (seq[i] != 'C' && seq[i] != 'A'))                                    {       countDiffs++;   }
979                                 else if(oligo[i] == 'K' && (seq[i] != 'T' && seq[i] != 'G'))                                    {       countDiffs++;   }
980                                 else if(oligo[i] == 'W' && (seq[i] != 'T' && seq[i] != 'A'))                                    {       countDiffs++;   }
981                                 else if(oligo[i] == 'S' && (seq[i] != 'C' && seq[i] != 'G'))                                    {       countDiffs++;   }
982                                 else if(oligo[i] == 'B' && (seq[i] != 'C' && seq[i] != 'T' && seq[i] != 'G'))   {       countDiffs++;   }
983                                 else if(oligo[i] == 'D' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'G'))   {       countDiffs++;   }
984                                 else if(oligo[i] == 'H' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'C'))   {       countDiffs++;   }
985                                 else if(oligo[i] == 'V' && (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G'))   {       countDiffs++;   }                       
986                                 
987                         }
988                 }
989                 
990                 return countDiffs;
991         }
992         catch(exception& e) {
993                 m->errorOut(e, "TrimSeqsCommand", "countDiffs");
994                 exit(1);
995         }
996
997 }
998 //***************************************************************************************************************
999
1000 bool TrimSeqsCommand::stripQualThreshold(Sequence& seq, ifstream& qFile){
1001         try {
1002                 string rawSequence = seq.getUnaligned();
1003                 int seqLength;  // = rawSequence.length();
1004                 string name, temp, temp2;
1005                 
1006                 qFile >> name >> temp;
1007         
1008                 splitAtEquals(temp2, temp); //separates length=242, temp=length, temp2=242
1009                 convert(temp, seqLength); //converts string to int
1010         
1011                 if (name.length() != 0) {  if(name.substr(1) != seq.getName())  {       m->mothurOut("sequence name mismatch btwn fasta and qual file"); m->mothurOutEndLine(); }  } 
1012                 while (!qFile.eof())    {       char c = qFile.get(); if (c == 10 || c == 13){  break;  }       }
1013                 
1014                 int score;
1015                 int end = seqLength;
1016                 
1017                 for(int i=0;i<seqLength;i++){
1018                         qFile >> score;
1019                         
1020                         if(score <= qThreshold){
1021                                 end = i;
1022                                 break;
1023                         }
1024                 }
1025                 for(int i=end+1;i<seqLength;i++){
1026                         qFile >> score;
1027                 }
1028                 
1029                 seq.setUnaligned(rawSequence.substr(0,end));
1030                 
1031                 return 1;
1032         }
1033         catch(exception& e) {
1034                 m->errorOut(e, "TrimSeqsCommand", "stripQualThreshold");
1035                 exit(1);
1036         }
1037 }
1038
1039 //***************************************************************************************************************
1040
1041 bool TrimSeqsCommand::cullQualAverage(Sequence& seq, ifstream& qFile){
1042         try {
1043                 string rawSequence = seq.getUnaligned();
1044                 int seqLength = seq.getNumBases();
1045                 bool success = 0;       //guilty until proven innocent
1046                 string name;
1047                 
1048                 qFile >> name;
1049                 if (name[0] == '>') {  if(name.substr(1) != seq.getName())      {       m->mothurOut("sequence name mismatch btwn fasta: " + seq.getName() + " and qual file: " + name); m->mothurOutEndLine(); }  }
1050                 
1051                 while (!qFile.eof())    {       char c = qFile.get(); if (c == 10 || c == 13){  break;  }       }
1052                 
1053                 float score;    
1054                 float average = 0;
1055                 
1056                 for(int i=0;i<seqLength;i++){
1057                         qFile >> score;
1058                         average += score;
1059                 }
1060                 average /= seqLength;
1061
1062                 if(average >= qAverage) {       success = 1;    }
1063                 else                                    {       success = 0;    }
1064                 
1065                 return success;
1066         }
1067         catch(exception& e) {
1068                 m->errorOut(e, "TrimSeqsCommand", "cullQualAverage");
1069                 exit(1);
1070         }
1071 }
1072
1073 //***************************************************************************************************************