]> git.donarmstrong.com Git - mothur.git/blob - trimseqscommand.cpp
pds - modified strip barcode funcrtion in trimseqscommand
[mothur.git] / trimseqscommand.cpp
1 /*
2  *  trimseqscommand.cpp
3  *  Mothur
4  *
5  *  Created by Pat Schloss on 6/6/09.
6  *  Copyright 2009 Patrick D. Schloss. All rights reserved.
7  *
8  */
9
10 #include "trimseqscommand.h"
11 #include "needlemanoverlap.hpp"
12
13 //***************************************************************************************************************
14
15 TrimSeqsCommand::TrimSeqsCommand(string option)  {
16         try {
17                 
18                 abort = false;
19                 
20                 //allow user to run help
21                 if(option == "help") { help(); abort = true; }
22                 
23                 else {
24                         //valid paramters for this command
25                         string AlignArray[] =  {"fasta", "flip", "oligos", "maxambig", "maxhomop", "minlength", "maxlength", "qfile", 
26                                                                         "qthreshold", "qaverage", "allfiles", "qtrim","tdiffs", "pdiffs", "bdiffs", "processors", "outputdir","inputdir"};
27                         
28                         vector<string> myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string)));
29                         
30                         OptionParser parser(option);
31                         map<string,string> parameters = parser.getParameters();
32                         
33                         ValidParameters validParameter;
34                         map<string,string>::iterator it;
35                         
36                         //check to make sure all parameters are valid for command
37                         for (it = parameters.begin(); it != parameters.end(); it++) { 
38                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
39                         }
40                         
41                         //if the user changes the input directory command factory will send this info to us in the output parameter 
42                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
43                         if (inputDir == "not found"){   inputDir = "";          }
44                         else {
45                                 string path;
46                                 it = parameters.find("fasta");
47                                 //user has given a template file
48                                 if(it != parameters.end()){ 
49                                         path = hasPath(it->second);
50                                         //if the user has not given a path then, add inputdir. else leave path alone.
51                                         if (path == "") {       parameters["fasta"] = inputDir + it->second;            }
52                                 }
53                                 
54                                 it = parameters.find("oligos");
55                                 //user has given a template file
56                                 if(it != parameters.end()){ 
57                                         path = hasPath(it->second);
58                                         //if the user has not given a path then, add inputdir. else leave path alone.
59                                         if (path == "") {       parameters["oligos"] = inputDir + it->second;           }
60                                 }
61                                 
62                                 it = parameters.find("qfile");
63                                 //user has given a template file
64                                 if(it != parameters.end()){ 
65                                         path = hasPath(it->second);
66                                         //if the user has not given a path then, add inputdir. else leave path alone.
67                                         if (path == "") {       parameters["qfile"] = inputDir + it->second;            }
68                                 }
69                         }
70
71                         
72                         //check for required parameters
73                         fastaFile = validParameter.validFile(parameters, "fasta", true);
74                         if (fastaFile == "not found") { m->mothurOut("fasta is a required parameter for the screen.seqs command."); m->mothurOutEndLine(); abort = true; }
75                         else if (fastaFile == "not open") { abort = true; }     
76                         
77                         //if the user changes the output directory command factory will send this info to us in the output parameter 
78                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  
79                                 outputDir = ""; 
80                                 outputDir += hasPath(fastaFile); //if user entered a file with a path then preserve it  
81                         }
82                 
83                         //check for optional parameter and set defaults
84                         // ...at some point should added some additional type checking...
85                         string temp;
86                         temp = validParameter.validFile(parameters, "flip", false);
87                         if (temp == "not found"){       flip = 0;       }
88                         else if(isTrue(temp))   {       flip = 1;       }
89                 
90                         temp = validParameter.validFile(parameters, "oligos", true);
91                         if (temp == "not found"){       oligoFile = "";         }
92                         else if(temp == "not open"){    abort = true;   } 
93                         else                                    {       oligoFile = temp;               }
94                         
95                         temp = validParameter.validFile(parameters, "maxambig", false);         if (temp == "not found") { temp = "-1"; }
96                         convert(temp, maxAmbig);  
97
98                         temp = validParameter.validFile(parameters, "maxhomop", false);         if (temp == "not found") { temp = "0"; }
99                         convert(temp, maxHomoP);  
100
101                         temp = validParameter.validFile(parameters, "minlength", false);        if (temp == "not found") { temp = "0"; }
102                         convert(temp, minLength); 
103                         
104                         temp = validParameter.validFile(parameters, "maxlength", false);        if (temp == "not found") { temp = "0"; }
105                         convert(temp, maxLength);
106                         
107                         temp = validParameter.validFile(parameters, "tdiffs", false);           if (temp == "not found") { temp = "0"; }
108                         convert(temp, tdiffs);
109                         
110                         temp = validParameter.validFile(parameters, "bdiffs", false);           if (temp == "not found") { temp = "0"; }
111                         convert(temp, bdiffs);
112                         
113                         temp = validParameter.validFile(parameters, "pdiffs", false);           if (temp == "not found") { temp = "0"; }
114                         convert(temp, pdiffs);
115                         
116                         if(tdiffs == 0){        tdiffs = bdiffs + pdiffs;       }
117                         
118                         temp = validParameter.validFile(parameters, "qfile", true);     
119                         if (temp == "not found")        {       qFileName = "";         }
120                         else if(temp == "not open")     {       abort = true;           }
121                         else                                            {       qFileName = temp;       }
122                         
123                         temp = validParameter.validFile(parameters, "qthreshold", false);       if (temp == "not found") { temp = "0"; }
124                         convert(temp, qThreshold);
125                         
126                         temp = validParameter.validFile(parameters, "qtrim", false);    if (temp == "not found") { temp = "F"; }
127                         qtrim = isTrue(temp);
128
129                         temp = validParameter.validFile(parameters, "qaverage", false);         if (temp == "not found") { temp = "0"; }
130                         convert(temp, qAverage);
131                         
132                         temp = validParameter.validFile(parameters, "allfiles", false);         if (temp == "not found") { temp = "F"; }
133                         allFiles = isTrue(temp);
134                         
135                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found"){       temp = "1";                             }
136                         convert(temp, processors); 
137                         
138                         if(allFiles && oligoFile == ""){
139                                 m->mothurOut("You selected allfiles, but didn't enter an oligos file.  Ignoring the allfiles request."); m->mothurOutEndLine();
140                         }
141                         if((qAverage != 0 && qThreshold != 0) && qFileName == ""){
142                                 m->mothurOut("You didn't provide a quality file name, quality criteria will be ignored."); m->mothurOutEndLine();
143                                 qAverage=0;
144                                 qThreshold=0;
145                         }
146                         if(!flip && oligoFile=="" && !maxLength && !minLength && (maxAmbig==-1) && !maxHomoP && qFileName == ""){               
147                                 m->mothurOut("You didn't set any options... quiting command."); m->mothurOutEndLine();
148                                 abort = true;
149                         }
150                 }
151
152         }
153         catch(exception& e) {
154                 m->errorOut(e, "TrimSeqsCommand", "TrimSeqsCommand");
155                 exit(1);
156         }
157 }
158 //**********************************************************************************************************************
159
160 void TrimSeqsCommand::help(){
161         try {
162                 m->mothurOut("The trim.seqs command reads a fastaFile and creates .....\n");
163                 m->mothurOut("The trim.seqs command parameters are fasta, flip, oligos, maxambig, maxhomop, minlength, maxlength, qfile, qthreshold, qaverage, diffs, qtrim and allfiles.\n");
164                 m->mothurOut("The fasta parameter is required.\n");
165                 m->mothurOut("The flip parameter .... The default is 0.\n");
166                 m->mothurOut("The oligos parameter .... The default is "".\n");
167                 m->mothurOut("The maxambig parameter .... The default is -1.\n");
168                 m->mothurOut("The maxhomop parameter .... The default is 0.\n");
169                 m->mothurOut("The minlength parameter .... The default is 0.\n");
170                 m->mothurOut("The maxlength parameter .... The default is 0.\n");
171                 m->mothurOut("The tdiffs parameter is used to specify the total number of differences allowed in the sequence. The default is 0.\n");
172                 m->mothurOut("The bdiffs parameter is used to specify the number of differences allowed in the barcode. The default is 0.\n");
173                 m->mothurOut("The pdiffs parameter is used to specify the number of differences allowed in the primer. The default is 0.\n");
174                 m->mothurOut("The qfile parameter .....\n");
175                 m->mothurOut("The qthreshold parameter .... The default is 0.\n");
176                 m->mothurOut("The qaverage parameter .... The default is 0.\n");
177                 m->mothurOut("The allfiles parameter .... The default is F.\n");
178                 m->mothurOut("The qtrim parameter .... The default is F.\n");
179                 m->mothurOut("The trim.seqs command should be in the following format: \n");
180                 m->mothurOut("trim.seqs(fasta=yourFastaFile, flip=yourFlip, oligos=yourOligos, maxambig=yourMaxambig,  \n");
181                 m->mothurOut("maxhomop=yourMaxhomop, minlength=youMinlength, maxlength=yourMaxlength)  \n");    
182                 m->mothurOut("Example trim.seqs(fasta=abrecovery.fasta, flip=..., oligos=..., maxambig=..., maxhomop=..., minlength=..., maxlength=...).\n");
183                 m->mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n");
184                 m->mothurOut("For more details please check out the wiki http://www.mothur.org/wiki/Trim.seqs .\n\n");
185
186         }
187         catch(exception& e) {
188                 m->errorOut(e, "TrimSeqsCommand", "help");
189                 exit(1);
190         }
191 }
192
193
194 //***************************************************************************************************************
195
196 TrimSeqsCommand::~TrimSeqsCommand(){    /*      do nothing      */      }
197
198 //***************************************************************************************************************
199
200 int TrimSeqsCommand::execute(){
201         try{
202         
203                 if (abort == true) { return 0; }
204                 
205                 numFPrimers = 0;  //this needs to be initialized
206                 numRPrimers = 0;
207                 
208                 string trimSeqFile = outputDir + getRootName(getSimpleName(fastaFile)) + "trim.fasta";
209                 outputNames.push_back(trimSeqFile);
210                 string scrapSeqFile = outputDir + getRootName(getSimpleName(fastaFile)) + "scrap.fasta";
211                 outputNames.push_back(scrapSeqFile);
212                 string groupFile = outputDir + getRootName(getSimpleName(fastaFile)) + "groups";
213                 
214                 vector<string> fastaFileNames;
215                 if(oligoFile != ""){
216                         outputNames.push_back(groupFile);
217                         getOligos(fastaFileNames);
218                 }
219                 
220                 if(qFileName != "")     {       setLines(qFileName, qLines);    }
221
222                 
223                 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
224                                 if(processors == 1){
225                                         ifstream inFASTA;
226                                         openInputFile(fastaFile, inFASTA);
227                                         int numSeqs=count(istreambuf_iterator<char>(inFASTA),istreambuf_iterator<char>(), '>');
228                                         inFASTA.close();
229                                         
230                                         lines.push_back(new linePair(0, numSeqs));
231                                         
232                                         driverCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, groupFile, fastaFileNames, lines[0], lines[0]);
233                                         
234                                         for (int j = 0; j < fastaFileNames.size(); j++) {
235                                                 rename((fastaFileNames[j] + toString(getpid()) + ".temp").c_str(), fastaFileNames[j].c_str());
236                                         }
237
238                                 }else{
239                                         setLines(fastaFile, lines);     
240                                         if(qFileName == "")     {       qLines = lines; }       
241                                                                 
242                                         createProcessesCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, groupFile, fastaFileNames); 
243                                         
244                                         rename((trimSeqFile + toString(processIDS[0]) + ".temp").c_str(), trimSeqFile.c_str());
245                                         rename((scrapSeqFile + toString(processIDS[0]) + ".temp").c_str(), scrapSeqFile.c_str());
246                                         rename((groupFile + toString(processIDS[0]) + ".temp").c_str(), groupFile.c_str());
247                                         for (int j = 0; j < fastaFileNames.size(); j++) {
248                                                 rename((fastaFileNames[j] + toString(processIDS[0]) + ".temp").c_str(), fastaFileNames[j].c_str());
249                                         }
250                                         //append files
251                                         for(int i=1;i<processors;i++){
252                                                 appendFiles((trimSeqFile + toString(processIDS[i]) + ".temp"), trimSeqFile);
253                                                 remove((trimSeqFile + toString(processIDS[i]) + ".temp").c_str());
254                                                 appendFiles((scrapSeqFile + toString(processIDS[i]) + ".temp"), scrapSeqFile);
255                                                 remove((scrapSeqFile + toString(processIDS[i]) + ".temp").c_str());
256                                                 appendFiles((groupFile + toString(processIDS[i]) + ".temp"), groupFile);
257                                                 remove((groupFile + toString(processIDS[i]) + ".temp").c_str());
258                                                 for (int j = 0; j < fastaFileNames.size(); j++) {
259                                                         appendFiles((fastaFileNames[j] + toString(processIDS[i]) + ".temp"), fastaFileNames[j]);
260                                                         remove((fastaFileNames[j] + toString(processIDS[i]) + ".temp").c_str());
261                                                 }
262                                         }
263                                 }
264                                 
265                                 if (m->control_pressed) {  return 0; }
266                 #else
267                                 ifstream inFASTA;
268                                 openInputFile(fastafileNames[s], inFASTA);
269                                 numSeqs=count(istreambuf_iterator<char>(inFASTA),istreambuf_iterator<char>(), '>');
270                                 inFASTA.close();
271                                 
272                                 lines.push_back(new linePair(0, numSeqs));
273                                 
274                                 driverCreateSummary(fastafile, qFileName, trimSeqFile, scrapSeqFile, groupFile, fastaFileNames, lines[0], lines[0]);
275                                 
276                                 if (m->control_pressed) {  return 0; }
277                 #endif
278                                                 
279                                                                                 
280                 for(int i=0;i<fastaFileNames.size();i++){
281                         ifstream inFASTA;
282                         string seqName;
283                         openInputFile(getRootName(fastaFile) + groupVector[i] + ".fasta", inFASTA);
284                         ofstream outGroups;
285                         openOutputFile(outputDir + getRootName(getSimpleName(fastaFile)) + groupVector[i] + ".groups", outGroups);
286                         outputNames.push_back(outputDir + getRootName(getSimpleName(fastaFile)) + groupVector[i] + ".groups");
287                         
288                         while(!inFASTA.eof()){
289                                 if(inFASTA.get() == '>'){
290                                         inFASTA >> seqName;
291                                         outGroups << seqName << '\t' << groupVector[i] << endl;
292                                 }
293                                 while (!inFASTA.eof())  {       char c = inFASTA.get(); if (c == 10 || c == 13){        break;  }       }
294                         }
295                         outGroups.close();
296                         inFASTA.close();
297                 }
298                 
299                 if (m->control_pressed) { 
300                         for (int i = 0; i < outputNames.size(); i++) {  remove(outputNames[i].c_str()); }
301                         return 0;
302                 }
303
304                 m->mothurOutEndLine();
305                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
306                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
307                 m->mothurOutEndLine();
308                 
309                 return 0;       
310                         
311         }
312         catch(exception& e) {
313                 m->errorOut(e, "TrimSeqsCommand", "execute");
314                 exit(1);
315         }
316 }
317                 
318 /**************************************************************************************/
319 int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string trimFile, string scrapFile, string groupFile, vector<string> fastaNames, linePair* line, linePair* qline) {     
320         try {
321                 
322                 ofstream outFASTA;
323                 int able = openOutputFile(trimFile, outFASTA);
324                 
325                 ofstream scrapFASTA;
326                 openOutputFile(scrapFile, scrapFASTA);
327                 
328                 ofstream outGroups;
329                 vector<ofstream*> fastaFileNames;
330                 if (oligoFile != "") {          
331                         openOutputFile(groupFile, outGroups);   
332                         for (int i = 0; i < fastaNames.size(); i++) {
333                                 fastaFileNames.push_back(new ofstream((fastaNames[i] + toString(getpid()) + ".temp").c_str(), ios::ate)); 
334                         }
335                 }
336                 
337                 ifstream inFASTA;
338                 openInputFile(filename, inFASTA);
339                 
340                 ifstream qFile;
341                 if(qFileName != "")     {       openInputFile(qFileName, qFile);        }
342                 
343                 qFile.seekg(qline->start);
344                 inFASTA.seekg(line->start);
345                 
346                 for(int i=0;i<line->num;i++){
347                                 
348                         if (m->control_pressed) { 
349                                 inFASTA.close(); 
350                                 outFASTA.close();
351                                 scrapFASTA.close();
352                                 if (oligoFile != "") {   outGroups.close();   }
353                                 if(qFileName != "")     {       qFile.close();  }
354                                 for(int i=0;i<fastaFileNames.size();i++){
355                                         fastaFileNames[i]->close();
356                                         delete fastaFileNames[i];
357                                 }       
358                                 for (int i = 0; i < outputNames.size(); i++) {  remove(outputNames[i].c_str()); }
359                                 return 0;
360                         }
361                         
362                         bool success = 1;
363                         
364                         Sequence currSeq(inFASTA);
365
366                         string origSeq = currSeq.getUnaligned();
367                         if (origSeq != "") {
368                                 int group;
369                                 string trashCode = "";
370                                 int currentSeqsDiffs = 0;
371                                 currentSeqsTdiffs = 0;
372                                 
373                                 if(qFileName != ""){
374                                         if(qThreshold != 0)             {       success = stripQualThreshold(currSeq, qFile);   }
375                                         else if(qAverage != 0)  {       success = cullQualAverage(currSeq, qFile);              }
376                                         if ((!qtrim) && (origSeq.length() != currSeq.getUnaligned().length())) { 
377                                                 success = 0; //if you don't want to trim and the sequence does not meet quality requirements, move to scrap
378                                         }
379                                         if(!success)                    {       trashCode += 'q';                                                               }
380                                 }
381                         
382                                 if(barcodes.size() != 0){
383                                         success = stripBarcode(currSeq, group);
384 //                                      cout << "here: " << success << endl;
385                                         if(!success){   trashCode += 'b';       }
386                                         else{ currentSeqsDiffs += currentSeqsTdiffs;  }
387                                 }
388
389                                 if(numFPrimers != 0){
390                                         success = stripForward(currSeq);
391                                         if(!success){   trashCode += 'f';       }
392                                         else{ currentSeqsDiffs += currentSeqsTdiffs;  }
393                                 }
394                                 
395                                 if (currentSeqsDiffs > tdiffs) { trashCode += 't';   }
396
397                                 if(numRPrimers != 0){
398                                         success = stripReverse(currSeq);
399                                         if(!success){   trashCode += 'r';       }
400                                 }
401                 
402                                 if(minLength > 0 || maxLength > 0){
403                                         success = cullLength(currSeq);
404                                         if(!success){   trashCode += 'l'; }
405                                 }
406                                 if(maxHomoP > 0){
407                                         success = cullHomoP(currSeq);
408                                         if(!success){   trashCode += 'h';       }
409                                 }
410                                 if(maxAmbig != -1){
411                                         success = cullAmbigs(currSeq);
412                                         if(!success){   trashCode += 'n';       }
413                                 }
414                                 
415                                 if(flip){       currSeq.reverseComplement();    }               // should go last                       
416                                 
417                                 if(trashCode.length() == 0){
418                                         currSeq.setAligned(currSeq.getUnaligned());  //this is because of a modification we made to the sequence class to fix a bug.  all seqs have an aligned version, which is the version that gets printed.
419                                         currSeq.printSequence(outFASTA);
420                                         if(barcodes.size() != 0){
421                                                 outGroups << currSeq.getName() << '\t' << groupVector[group] << endl;
422                                                 
423                                                 if(allFiles){
424                                                         currSeq.printSequence(*fastaFileNames[group]);                                  
425                                                 }
426                                         }
427                                 }
428                                 else{
429                                         currSeq.setName(currSeq.getName() + '|' + trashCode);
430                                         currSeq.setUnaligned(origSeq);
431                                         currSeq.printSequence(scrapFASTA);
432                                 }
433                         }
434                         gobble(inFASTA);
435                 }
436                 
437                 inFASTA.close();
438                 outFASTA.close();
439                 scrapFASTA.close();
440                 if (oligoFile != "") {   outGroups.close();   }
441                 if(qFileName != "")     {       qFile.close();  }
442                 
443                 for(int i=0;i<fastaFileNames.size();i++){
444                         fastaFileNames[i]->close();
445                         delete fastaFileNames[i];
446                 }               
447                 
448                 return 0;
449         }
450         catch(exception& e) {
451                 m->errorOut(e, "TrimSeqsCommand", "driverCreateTrim");
452                 exit(1);
453         }
454 }
455 /**************************************************************************************************/
456 int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName, string trimFile, string scrapFile, string groupFile, vector<string> fastaNames) {
457         try {
458 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
459                 int process = 0;
460                 int exitCommand = 1;
461                 processIDS.clear();
462                 
463                 //loop through and create all the processes you want
464                 while (process != processors) {
465                         int pid = fork();
466                         
467                         if (pid > 0) {
468                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
469                                 process++;
470                         }else if (pid == 0){
471                                 driverCreateTrim(filename, qFileName, (trimFile + toString(getpid()) + ".temp"), (scrapFile + toString(getpid()) + ".temp"), (groupFile + toString(getpid()) + ".temp"), fastaNames, lines[process], qLines[process]);
472                                 exit(0);
473                         }else { m->mothurOut("unable to spawn the necessary processes."); m->mothurOutEndLine(); exit(0); }
474                 }
475                 
476                 //force parent to wait until all the processes are done
477                 for (int i=0;i<processors;i++) { 
478                         int temp = processIDS[i];
479                         wait(&temp);
480                 }
481                 
482                 return exitCommand;
483 #endif          
484         }
485         catch(exception& e) {
486                 m->errorOut(e, "TrimSeqsCommand", "createProcessesCreateTrim");
487                 exit(1);
488         }
489 }
490 /**************************************************************************************************/
491
492 int TrimSeqsCommand::setLines(string filename, vector<linePair*>& lines) {
493         try {
494                 
495                 lines.clear();
496                 
497                 vector<long int> positions;
498                 
499                 ifstream inFASTA;
500                 openInputFile(filename, inFASTA);
501                         
502                 string input;
503                 while(!inFASTA.eof()){  
504                         input = getline(inFASTA);
505
506                         if (input.length() != 0) {
507                                 if(input[0] == '>'){ long int pos = inFASTA.tellg(); positions.push_back(pos - input.length() - 1);     }
508                         }
509                 }
510                 inFASTA.close();
511                 
512                 int numFastaSeqs = positions.size();
513         
514                 FILE * pFile;
515                 long size;
516                 
517                 //get num bytes in file
518                 pFile = fopen (filename.c_str(),"rb");
519                 if (pFile==NULL) perror ("Error opening file");
520                 else{
521                         fseek (pFile, 0, SEEK_END);
522                         size=ftell (pFile);
523                         fclose (pFile);
524                 }
525                 
526                 int numSeqsPerProcessor = numFastaSeqs / processors;
527                 
528                 for (int i = 0; i < processors; i++) {
529
530                         long int startPos = positions[ i * numSeqsPerProcessor ];
531                         if(i == processors - 1){
532                                 numSeqsPerProcessor = numFastaSeqs - i * numSeqsPerProcessor;
533                         }else{  
534                                 long int myEnd = positions[ (i+1) * numSeqsPerProcessor ];
535                         }
536                         lines.push_back(new linePair(startPos, numSeqsPerProcessor));
537                 }
538                 
539                 return numFastaSeqs;
540         }
541         catch(exception& e) {
542                 m->errorOut(e, "TrimSeqsCommand", "setLines");
543                 exit(1);
544         }
545 }
546 //***************************************************************************************************************
547
548 void TrimSeqsCommand::getOligos(vector<string>& outFASTAVec){ //vector<ofstream*>& outFASTAVec
549         try {
550                 ifstream inOligos;
551                 openInputFile(oligoFile, inOligos);
552                 
553                 ofstream test;
554                 
555                 string type, oligo, group;
556                 int index=0;
557                 
558                 while(!inOligos.eof()){
559                         inOligos >> type;
560                         
561                         if(type[0] == '#'){
562                                 while (!inOligos.eof()) {       char c = inOligos.get(); if (c == 10 || c == 13){       break;  }       } // get rest of line if there's any crap there
563                         }
564                         else{
565                                 inOligos >> oligo;
566                                 
567                                 for(int i=0;i<oligo.length();i++){
568                                         oligo[i] = toupper(oligo[i]);
569                                         if(oligo[i] == 'U')     {       oligo[i] = 'T'; }
570                                 }
571                                 
572                                 if(type == "forward"){
573                                         forPrimer.push_back(oligo);
574                                 }
575                                 else if(type == "reverse"){
576                                         Sequence oligoRC("reverse", oligo);
577                                         oligoRC.reverseComplement();
578                                         revPrimer.push_back(oligoRC.getUnaligned());
579                                 }
580                                 else if(type == "barcode"){
581                                         inOligos >> group;
582                                         barcodes[oligo]=index++;
583                                         groupVector.push_back(group);
584                                         
585                                         if(allFiles){
586                                                 //outFASTAVec.push_back(new ofstream((outputDir + getRootName(getSimpleName(fastaFile)) + group + ".fasta").c_str(), ios::ate));
587                                                 outputNames.push_back((outputDir + getRootName(getSimpleName(fastaFile)) + group + ".fasta"));
588                                                 outFASTAVec.push_back((outputDir + getRootName(getSimpleName(fastaFile)) + group + ".fasta"));
589                                         }
590                                 }
591                         }
592                 }
593                 
594                 inOligos.close();
595                 
596                 numFPrimers = forPrimer.size();
597                 numRPrimers = revPrimer.size();
598                 
599         }
600         catch(exception& e) {
601                 m->errorOut(e, "TrimSeqsCommand", "getOligos");
602                 exit(1);
603         }
604 }
605 //***************************************************************************************************************
606
607 bool TrimSeqsCommand::stripBarcode(Sequence& seq, int& group){
608         try {
609                 string rawSequence = seq.getUnaligned();
610                 bool success = 0;       //guilty until proven innocent
611                 
612                 //can you find the barcode
613                 for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
614                         string oligo = it->first;
615                         if(rawSequence.length() < oligo.length()){      //let's just assume that the barcodes are the same length
616                                 success = 0;
617                                 break;
618                         }
619                         
620                         if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
621                                 group = it->second;
622                                 seq.setUnaligned(rawSequence.substr(oligo.length()));
623                                 success = 1;
624                                 break;
625                         }
626                 }
627                 
628                 //if you found the barcode or if you don't want to allow for diffs
629 //              cout << success;
630                 if ((bdiffs == 0) || (success == 1)) { return success;  }
631                 
632                 else { //try aligning and see if you can find it
633 //                      cout << endl;
634
635                         int maxLength = 0;
636
637                         Alignment* alignment;
638                         if (barcodes.size() > 0) {
639                                 map<string,int>::iterator it=barcodes.begin();
640
641                                 for(it;it!=barcodes.end();it++){
642                                         if(it->first.length() > maxLength){
643                                                 maxLength = it->first.length();
644                                         }
645                                 }
646                                 alignment = new NeedlemanOverlap(-2.0, 1.0, -1.0, (maxLength+bdiffs+1));  
647
648                         }else{ alignment = NULL; } 
649                         
650                         //can you find the barcode
651                         int minDiff = 1e6;
652                         int minCount = 1;
653                         int minGroup = -1;
654                         int minPos = 0;
655                         
656                         for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
657                                 string oligo = it->first;
658 //                              int length = oligo.length();
659                                 
660                                 if(rawSequence.length() < maxLength){   //let's just assume that the barcodes are the same length
661                                         success = 0;
662                                         break;
663                                 }
664                                 
665                                 //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
666                                 alignment->align(oligo, rawSequence.substr(0,oligo.length()+bdiffs));
667                                 oligo = alignment->getSeqAAln();
668                                 string temp = alignment->getSeqBAln();
669                 
670                                 int alnLength = oligo.length();
671                                 
672                                 for(int i=oligo.length()-1;i>=0;i--){
673                                         if(oligo[i] != '-'){    alnLength = i+1;        break;  }
674                                 }
675                                 oligo = oligo.substr(0,alnLength);
676                                 temp = temp.substr(0,alnLength);
677 //                              cout << "barcode = " << oligo << " raw = " << rawSequence.substr(0,alnLength) << " raw aligned = " << temp << endl;                     
678                                 cout << seq.getName() << endl;
679                                 cout << temp << endl;
680                                 cout << oligo << endl;
681                                 cout << alnLength << endl;
682                                 cout << endl;
683                                 
684                                 int newStart=0;
685                                 int numDiff = countDiffs(oligo, temp);//, alnLength, newStart, bdiffs);
686                                 if(numDiff < minDiff){
687                                         minDiff = numDiff;
688                                         minCount = 1;
689                                         minGroup = it->second;
690                                         minPos = 0;
691                                         for(int i=0;i<alnLength;i++){
692                                                 if(temp[i] != '-'){
693                                                         minPos++;
694                                                 }
695                                         }
696                                 }
697                                 else if(numDiff == minDiff){
698                                         minCount++;
699                                 }
700
701                         }
702                         if(minDiff > bdiffs){   success = 0;    }
703                         else if(minCount > 1)   {       success = 0;    }
704                         else{
705                                 group = minGroup;
706                                 seq.setUnaligned("*" + rawSequence.substr(minPos));
707                                 success = 1;
708                         }
709                         
710                         if (alignment != NULL) {  delete alignment;  }
711                 }
712                 return success;
713                 
714         }
715         catch(exception& e) {
716                 m->errorOut(e, "TrimSeqsCommand", "stripBarcode");
717                 exit(1);
718         }
719
720 }
721
722 //***************************************************************************************************************
723
724 bool TrimSeqsCommand::stripForward(Sequence& seq){
725         try {
726                 string rawSequence = seq.getUnaligned();
727                 bool success = 0;       //guilty until proven innocent
728                 
729                 for(int i=0;i<numFPrimers;i++){
730                         string oligo = forPrimer[i];
731                         
732                         if(rawSequence.length() < oligo.length()){
733                                 success = 0;
734                                 break;
735                         }
736                         
737                         if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
738                                 seq.setUnaligned(rawSequence.substr(oligo.length()));
739                                 success = 1;
740                                 break;
741                         }
742                 }
743                 
744                 //if you found the primer or if you don't want to allow for diffs
745                 if ((pdiffs == 0) || (success == 1)) { return success;  }
746                 
747                 else { //try aligning and see if you can find it
748                         
749                         Alignment* alignment;
750                         if (numFPrimers > 0) {  alignment = new NeedlemanOverlap(-2.0, 1.0, -1.0, (forPrimer[0].length()+pdiffs+1));  } 
751                         else{ alignment = NULL; } 
752                         //can you find the primer
753                         for(int i=0;i<numFPrimers;i++){
754                                 string oligo = forPrimer[i];
755                                 int length = oligo.length();
756                         
757                                 if(rawSequence.length() < oligo.length()){      
758                                         success = 0;
759                                         break;
760                                 }
761                         
762                                 //resize if neccessary
763                                 if ((length+pdiffs+1) > alignment->getnRows()) { alignment->resize(length+pdiffs+1);    }
764                                 
765                                 //use needleman to align first primer.length()+numdiffs of sequence to each primer
766                                 alignment->align(oligo, rawSequence.substr(0,length+pdiffs));
767                                 oligo = alignment->getSeqAAln();
768                                 string temp = alignment->getSeqBAln();
769                         
770                                 int newStart = 0;
771 //                              if(compareDNASeq(oligo, temp, length, newStart, pdiffs)){
772 //                                      seq.setUnaligned(rawSequence.substr(newStart));
773 //                                      success = 1;
774 //                                      break;
775 //                              }
776                         }
777                         
778                         if (alignment != NULL) {  delete alignment;  }
779                 }
780                 
781                 return success;
782                 
783         }
784         catch(exception& e) {
785                 m->errorOut(e, "TrimSeqsCommand", "stripForward");
786                 exit(1);
787         }
788 }
789
790 //***************************************************************************************************************
791
792 bool TrimSeqsCommand::stripReverse(Sequence& seq){
793         try {
794                 string rawSequence = seq.getUnaligned();
795                 bool success = 0;       //guilty until proven innocent
796                 
797                 for(int i=0;i<numRPrimers;i++){
798                         string oligo = revPrimer[i];
799                         
800                         if(rawSequence.length() < oligo.length()){
801                                 success = 0;
802                                 break;
803                         }
804                         
805                         if(compareDNASeq(oligo, rawSequence.substr(rawSequence.length()-oligo.length(),oligo.length()))){
806                                 seq.setUnaligned(rawSequence.substr(0,rawSequence.length()-oligo.length()));
807                                 success = 1;
808                                 break;
809                         }
810                 }       
811                 return success;
812                 
813         }
814         catch(exception& e) {
815                 m->errorOut(e, "TrimSeqsCommand", "stripReverse");
816                 exit(1);
817         }
818 }
819
820 //***************************************************************************************************************
821
822 bool TrimSeqsCommand::cullLength(Sequence& seq){
823         try {
824         
825                 int length = seq.getNumBases();
826                 bool success = 0;       //guilty until proven innocent
827                 
828                 if(length >= minLength && maxLength == 0)                       {       success = 1;    }
829                 else if(length >= minLength && length <= maxLength)     {       success = 1;    }
830                 else                                                                                            {       success = 0;    }
831                 
832                 return success;
833         
834         }
835         catch(exception& e) {
836                 m->errorOut(e, "TrimSeqsCommand", "cullLength");
837                 exit(1);
838         }
839         
840 }
841
842 //***************************************************************************************************************
843
844 bool TrimSeqsCommand::cullHomoP(Sequence& seq){
845         try {
846                 int longHomoP = seq.getLongHomoPolymer();
847                 bool success = 0;       //guilty until proven innocent
848                 
849                 if(longHomoP <= maxHomoP){      success = 1;    }
850                 else                                    {       success = 0;    }
851                 
852                 return success;
853         }
854         catch(exception& e) {
855                 m->errorOut(e, "TrimSeqsCommand", "cullHomoP");
856                 exit(1);
857         }
858         
859 }
860
861 //***************************************************************************************************************
862
863 bool TrimSeqsCommand::cullAmbigs(Sequence& seq){
864         try {
865                 int numNs = seq.getAmbigBases();
866                 bool success = 0;       //guilty until proven innocent
867                 
868                 if(numNs <= maxAmbig)   {       success = 1;    }
869                 else                                    {       success = 0;    }
870                 
871                 return success;
872         }
873         catch(exception& e) {
874                 m->errorOut(e, "TrimSeqsCommand", "cullAmbigs");
875                 exit(1);
876         }
877         
878 }
879
880 //***************************************************************************************************************
881
882 bool TrimSeqsCommand::compareDNASeq(string oligo, string seq){
883         try {
884                 bool success = 1;
885                 int length = oligo.length();
886                 
887                 for(int i=0;i<length;i++){
888                         
889                         if(oligo[i] != seq[i]){
890                                 if(oligo[i] == 'A' || oligo[i] == 'T' || oligo[i] == 'G' || oligo[i] == 'C')    {       success = 0;    }
891                                 else if((oligo[i] == 'N' || oligo[i] == 'I') && (seq[i] == 'N'))                                {       success = 0;    }
892                                 else if(oligo[i] == 'R' && (seq[i] != 'A' && seq[i] != 'G'))                                    {       success = 0;    }
893                                 else if(oligo[i] == 'Y' && (seq[i] != 'C' && seq[i] != 'T'))                                    {       success = 0;    }
894                                 else if(oligo[i] == 'M' && (seq[i] != 'C' && seq[i] != 'A'))                                    {       success = 0;    }
895                                 else if(oligo[i] == 'K' && (seq[i] != 'T' && seq[i] != 'G'))                                    {       success = 0;    }
896                                 else if(oligo[i] == 'W' && (seq[i] != 'T' && seq[i] != 'A'))                                    {       success = 0;    }
897                                 else if(oligo[i] == 'S' && (seq[i] != 'C' && seq[i] != 'G'))                                    {       success = 0;    }
898                                 else if(oligo[i] == 'B' && (seq[i] != 'C' && seq[i] != 'T' && seq[i] != 'G'))   {       success = 0;    }
899                                 else if(oligo[i] == 'D' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'G'))   {       success = 0;    }
900                                 else if(oligo[i] == 'H' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'C'))   {       success = 0;    }
901                                 else if(oligo[i] == 'V' && (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G'))   {       success = 0;    }                       
902                                 
903                                 if(success == 0)        {       break;   }
904                         }
905                         else{
906                                 success = 1;
907                         }
908                 }
909                 
910                 return success;
911         }
912         catch(exception& e) {
913                 m->errorOut(e, "TrimSeqsCommand", "compareDNASeq");
914                 exit(1);
915         }
916
917 }
918 //***************************************************************************************************************
919
920 int TrimSeqsCommand::countDiffs(string oligo, string seq){//, int numBases, int& end, int diffs){
921         try {
922 //              bool success = 1;
923                 int length = oligo.length();
924 //              end = numBases;
925 //              int countBases = 0;
926                 int countDiffs = 0;
927                 
928         
929 //              if (length != 0) {
930 //                      if ((oligo[0] == '-') || (oligo[0] == '.')) {  return 1e6;  } //no gaps allowed at beginning
931 //              }
932                 
933                 for(int i=0;i<length;i++){
934                         
935 //                      if ((oligo[i] != '-') && (oligo[i] != '.'))  { countBases++; } 
936                                         
937                         if(oligo[i] != seq[i]){
938                                 if(oligo[i] == 'A' || oligo[i] == 'T' || oligo[i] == 'G' || oligo[i] == 'C' || oligo[i] == '-' || oligo[i] == '.')      {       countDiffs++;   }
939                                 else if((oligo[i] == 'N' || oligo[i] == 'I') && (seq[i] == 'N'))                                {       countDiffs++;   }
940                                 else if(oligo[i] == 'R' && (seq[i] != 'A' && seq[i] != 'G'))                                    {       countDiffs++;   }
941                                 else if(oligo[i] == 'Y' && (seq[i] != 'C' && seq[i] != 'T'))                                    {       countDiffs++;   }
942                                 else if(oligo[i] == 'M' && (seq[i] != 'C' && seq[i] != 'A'))                                    {       countDiffs++;   }
943                                 else if(oligo[i] == 'K' && (seq[i] != 'T' && seq[i] != 'G'))                                    {       countDiffs++;   }
944                                 else if(oligo[i] == 'W' && (seq[i] != 'T' && seq[i] != 'A'))                                    {       countDiffs++;   }
945                                 else if(oligo[i] == 'S' && (seq[i] != 'C' && seq[i] != 'G'))                                    {       countDiffs++;   }
946                                 else if(oligo[i] == 'B' && (seq[i] != 'C' && seq[i] != 'T' && seq[i] != 'G'))   {       countDiffs++;   }
947                                 else if(oligo[i] == 'D' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'G'))   {       countDiffs++;   }
948                                 else if(oligo[i] == 'H' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'C'))   {       countDiffs++;   }
949                                 else if(oligo[i] == 'V' && (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G'))   {       countDiffs++;   }                       
950                                 
951 //                              if(countDiffs > diffs)  {       break;   }
952                         }
953 //                      else{
954 //                              success = 1;
955 //                      }
956                         
957 //                      if (countBases >= numBases) { end = countBases; break; } //stop checking after end of barcode or primer
958                 }
959                 //if it's a success we want to check for total diffs in driver, so save it.
960 //              if (success == 1) {  currentSeqsTdiffs = countDiffs; }
961                 
962                 return countDiffs;
963         }
964         catch(exception& e) {
965                 m->errorOut(e, "TrimSeqsCommand", "countDiffs");
966                 exit(1);
967         }
968
969 }
970 //***************************************************************************************************************
971
972 bool TrimSeqsCommand::stripQualThreshold(Sequence& seq, ifstream& qFile){
973         try {
974                 string rawSequence = seq.getUnaligned();
975                 int seqLength;  // = rawSequence.length();
976                 string name, temp, temp2;
977                 
978                 qFile >> name >> temp;
979         
980                 splitAtEquals(temp2, temp); //separates length=242, temp=length, temp2=242
981                 convert(temp, seqLength); //converts string to int
982         
983                 if (name.length() != 0) {  if(name.substr(1) != seq.getName())  {       m->mothurOut("sequence name mismatch btwn fasta and qual file"); m->mothurOutEndLine(); }  } 
984                 while (!qFile.eof())    {       char c = qFile.get(); if (c == 10 || c == 13){  break;  }       }
985                 
986                 int score;
987                 int end = seqLength;
988                 
989                 for(int i=0;i<seqLength;i++){
990                         qFile >> score;
991                         
992                         if(score <= qThreshold){
993                                 end = i;
994                                 break;
995                         }
996                 }
997                 for(int i=end+1;i<seqLength;i++){
998                         qFile >> score;
999                 }
1000                 
1001                 seq.setUnaligned(rawSequence.substr(0,end));
1002                 
1003                 return 1;
1004         }
1005         catch(exception& e) {
1006                 m->errorOut(e, "TrimSeqsCommand", "stripQualThreshold");
1007                 exit(1);
1008         }
1009 }
1010
1011 //***************************************************************************************************************
1012
1013 bool TrimSeqsCommand::cullQualAverage(Sequence& seq, ifstream& qFile){
1014         try {
1015                 string rawSequence = seq.getUnaligned();
1016                 int seqLength = seq.getNumBases();
1017                 bool success = 0;       //guilty until proven innocent
1018                 string name;
1019                 
1020                 qFile >> name;
1021                 if (name[0] == '>') {  if(name.substr(1) != seq.getName())      {       m->mothurOut("sequence name mismatch btwn fasta: " + seq.getName() + " and qual file: " + name); m->mothurOutEndLine(); }  }
1022                 
1023                 while (!qFile.eof())    {       char c = qFile.get(); if (c == 10 || c == 13){  break;  }       }
1024                 
1025                 float score;    
1026                 float average = 0;
1027                 
1028                 for(int i=0;i<seqLength;i++){
1029                         qFile >> score;
1030                         average += score;
1031                 }
1032                 average /= seqLength;
1033
1034                 if(average >= qAverage) {       success = 1;    }
1035                 else                                    {       success = 0;    }
1036                 
1037                 return success;
1038         }
1039         catch(exception& e) {
1040                 m->errorOut(e, "TrimSeqsCommand", "cullQualAverage");
1041                 exit(1);
1042         }
1043 }
1044
1045 //***************************************************************************************************************