]> git.donarmstrong.com Git - mothur.git/blob - trimseqscommand.cpp
changed how we count sequences in a fastafile to allow for '>' in sequence names
[mothur.git] / trimseqscommand.cpp
1 /*
2  *  trimseqscommand.cpp
3  *  Mothur
4  *
5  *  Created by Pat Schloss on 6/6/09.
6  *  Copyright 2009 Patrick D. Schloss. All rights reserved.
7  *
8  */
9
10 #include "trimseqscommand.h"
11 #include "needlemanoverlap.hpp"
12
13 //***************************************************************************************************************
14
15 TrimSeqsCommand::TrimSeqsCommand(string option)  {
16         try {
17                 
18                 abort = false;
19                 
20                 //allow user to run help
21                 if(option == "help") { help(); abort = true; }
22                 
23                 else {
24                         //valid paramters for this command
25                         string AlignArray[] =  {"fasta", "flip", "oligos", "maxambig", "maxhomop", "minlength", "maxlength", "qfile", 
26                                                                         "qthreshold", "qaverage", "allfiles", "qtrim","tdiffs", "pdiffs", "bdiffs", "processors", "outputdir","inputdir"};
27                         
28                         vector<string> myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string)));
29                         
30                         OptionParser parser(option);
31                         map<string,string> parameters = parser.getParameters();
32                         
33                         ValidParameters validParameter;
34                         map<string,string>::iterator it;
35                         
36                         //check to make sure all parameters are valid for command
37                         for (it = parameters.begin(); it != parameters.end(); it++) { 
38                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
39                         }
40                         
41                         //if the user changes the input directory command factory will send this info to us in the output parameter 
42                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
43                         if (inputDir == "not found"){   inputDir = "";          }
44                         else {
45                                 string path;
46                                 it = parameters.find("fasta");
47                                 //user has given a template file
48                                 if(it != parameters.end()){ 
49                                         path = hasPath(it->second);
50                                         //if the user has not given a path then, add inputdir. else leave path alone.
51                                         if (path == "") {       parameters["fasta"] = inputDir + it->second;            }
52                                 }
53                                 
54                                 it = parameters.find("oligos");
55                                 //user has given a template file
56                                 if(it != parameters.end()){ 
57                                         path = hasPath(it->second);
58                                         //if the user has not given a path then, add inputdir. else leave path alone.
59                                         if (path == "") {       parameters["oligos"] = inputDir + it->second;           }
60                                 }
61                                 
62                                 it = parameters.find("qfile");
63                                 //user has given a template file
64                                 if(it != parameters.end()){ 
65                                         path = hasPath(it->second);
66                                         //if the user has not given a path then, add inputdir. else leave path alone.
67                                         if (path == "") {       parameters["qfile"] = inputDir + it->second;            }
68                                 }
69                         }
70
71                         
72                         //check for required parameters
73                         fastaFile = validParameter.validFile(parameters, "fasta", true);
74                         if (fastaFile == "not found") { m->mothurOut("fasta is a required parameter for the screen.seqs command."); m->mothurOutEndLine(); abort = true; }
75                         else if (fastaFile == "not open") { abort = true; }     
76                         
77                         //if the user changes the output directory command factory will send this info to us in the output parameter 
78                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  
79                                 outputDir = ""; 
80                                 outputDir += hasPath(fastaFile); //if user entered a file with a path then preserve it  
81                         }
82                 
83                         //check for optional parameter and set defaults
84                         // ...at some point should added some additional type checking...
85                         string temp;
86                         temp = validParameter.validFile(parameters, "flip", false);
87                         if (temp == "not found"){       flip = 0;       }
88                         else if(isTrue(temp))   {       flip = 1;       }
89                 
90                         temp = validParameter.validFile(parameters, "oligos", true);
91                         if (temp == "not found"){       oligoFile = "";         }
92                         else if(temp == "not open"){    abort = true;   } 
93                         else                                    {       oligoFile = temp;               }
94                         
95                         temp = validParameter.validFile(parameters, "maxambig", false);         if (temp == "not found") { temp = "-1"; }
96                         convert(temp, maxAmbig);  
97
98                         temp = validParameter.validFile(parameters, "maxhomop", false);         if (temp == "not found") { temp = "0"; }
99                         convert(temp, maxHomoP);  
100
101                         temp = validParameter.validFile(parameters, "minlength", false);        if (temp == "not found") { temp = "0"; }
102                         convert(temp, minLength); 
103                         
104                         temp = validParameter.validFile(parameters, "maxlength", false);        if (temp == "not found") { temp = "0"; }
105                         convert(temp, maxLength);
106                         
107                         
108                         temp = validParameter.validFile(parameters, "bdiffs", false);           if (temp == "not found") { temp = "0"; }
109                         convert(temp, bdiffs);
110                         
111                         temp = validParameter.validFile(parameters, "pdiffs", false);           if (temp == "not found") { temp = "0"; }
112                         convert(temp, pdiffs);
113                         
114                         temp = validParameter.validFile(parameters, "tdiffs", false);           if (temp == "not found") { int tempTotal = pdiffs + bdiffs;  temp = toString(tempTotal); }
115                         convert(temp, tdiffs);
116                         
117                         if(tdiffs == 0){        tdiffs = bdiffs + pdiffs;       }
118                         
119                         temp = validParameter.validFile(parameters, "qfile", true);     
120                         if (temp == "not found")        {       qFileName = "";         }
121                         else if(temp == "not open")     {       abort = true;           }
122                         else                                            {       qFileName = temp;       }
123                         
124                         temp = validParameter.validFile(parameters, "qthreshold", false);       if (temp == "not found") { temp = "0"; }
125                         convert(temp, qThreshold);
126                         
127                         temp = validParameter.validFile(parameters, "qtrim", false);    if (temp == "not found") { temp = "F"; }
128                         qtrim = isTrue(temp);
129
130                         temp = validParameter.validFile(parameters, "qaverage", false);         if (temp == "not found") { temp = "0"; }
131                         convert(temp, qAverage);
132                         
133                         temp = validParameter.validFile(parameters, "allfiles", false);         if (temp == "not found") { temp = "F"; }
134                         allFiles = isTrue(temp);
135                         
136                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found"){       temp = "1";                             }
137                         convert(temp, processors); 
138                         
139                         if(allFiles && oligoFile == ""){
140                                 m->mothurOut("You selected allfiles, but didn't enter an oligos file.  Ignoring the allfiles request."); m->mothurOutEndLine();
141                         }
142                         if((qAverage != 0 && qThreshold != 0) && qFileName == ""){
143                                 m->mothurOut("You didn't provide a quality file name, quality criteria will be ignored."); m->mothurOutEndLine();
144                                 qAverage=0;
145                                 qThreshold=0;
146                         }
147                         if(!flip && oligoFile=="" && !maxLength && !minLength && (maxAmbig==-1) && !maxHomoP && qFileName == ""){               
148                                 m->mothurOut("You didn't set any options... quiting command."); m->mothurOutEndLine();
149                                 abort = true;
150                         }
151                 }
152
153         }
154         catch(exception& e) {
155                 m->errorOut(e, "TrimSeqsCommand", "TrimSeqsCommand");
156                 exit(1);
157         }
158 }
159 //**********************************************************************************************************************
160
161 void TrimSeqsCommand::help(){
162         try {
163                 m->mothurOut("The trim.seqs command reads a fastaFile and creates .....\n");
164                 m->mothurOut("The trim.seqs command parameters are fasta, flip, oligos, maxambig, maxhomop, minlength, maxlength, qfile, qthreshold, qaverage, diffs, qtrim and allfiles.\n");
165                 m->mothurOut("The fasta parameter is required.\n");
166                 m->mothurOut("The flip parameter will output the reverse compliment of your trimmed sequence. The default is false.\n");
167                 m->mothurOut("The oligos parameter .... The default is "".\n");
168                 m->mothurOut("The maxambig parameter .... The default is -1.\n");
169                 m->mothurOut("The maxhomop parameter .... The default is 0.\n");
170                 m->mothurOut("The minlength parameter .... The default is 0.\n");
171                 m->mothurOut("The maxlength parameter .... The default is 0.\n");
172                 m->mothurOut("The tdiffs parameter is used to specify the total number of differences allowed in the sequence. The default is pdiffs + bdiffs.\n");
173                 m->mothurOut("The bdiffs parameter is used to specify the number of differences allowed in the barcode. The default is 0.\n");
174                 m->mothurOut("The pdiffs parameter is used to specify the number of differences allowed in the primer. The default is 0.\n");
175                 m->mothurOut("The qfile parameter .....\n");
176                 m->mothurOut("The qthreshold parameter .... The default is 0.\n");
177                 m->mothurOut("The qaverage parameter .... The default is 0.\n");
178                 m->mothurOut("The allfiles parameter .... The default is F.\n");
179                 m->mothurOut("The qtrim parameter .... The default is F.\n");
180                 m->mothurOut("The trim.seqs command should be in the following format: \n");
181                 m->mothurOut("trim.seqs(fasta=yourFastaFile, flip=yourFlip, oligos=yourOligos, maxambig=yourMaxambig,  \n");
182                 m->mothurOut("maxhomop=yourMaxhomop, minlength=youMinlength, maxlength=yourMaxlength)  \n");    
183                 m->mothurOut("Example trim.seqs(fasta=abrecovery.fasta, flip=..., oligos=..., maxambig=..., maxhomop=..., minlength=..., maxlength=...).\n");
184                 m->mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n");
185                 m->mothurOut("For more details please check out the wiki http://www.mothur.org/wiki/Trim.seqs .\n\n");
186
187         }
188         catch(exception& e) {
189                 m->errorOut(e, "TrimSeqsCommand", "help");
190                 exit(1);
191         }
192 }
193
194
195 //***************************************************************************************************************
196
197 TrimSeqsCommand::~TrimSeqsCommand(){    /*      do nothing      */      }
198
199 //***************************************************************************************************************
200
201 int TrimSeqsCommand::execute(){
202         try{
203         
204                 if (abort == true) { return 0; }
205                 
206                 numFPrimers = 0;  //this needs to be initialized
207                 numRPrimers = 0;
208                 
209                 string trimSeqFile = outputDir + getRootName(getSimpleName(fastaFile)) + "trim.fasta";
210                 outputNames.push_back(trimSeqFile);
211                 string scrapSeqFile = outputDir + getRootName(getSimpleName(fastaFile)) + "scrap.fasta";
212                 outputNames.push_back(scrapSeqFile);
213                 string groupFile = outputDir + getRootName(getSimpleName(fastaFile)) + "groups";
214                 
215                 vector<string> fastaFileNames;
216                 if(oligoFile != ""){
217                         outputNames.push_back(groupFile);
218                         getOligos(fastaFileNames);
219                 }
220                 
221                 if(qFileName != "")     {       setLines(qFileName, qLines);    }
222
223                 
224                 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
225                                 if(processors == 1){
226                                         ifstream inFASTA;
227                                         int numSeqs;
228                                         openInputFile(fastaFile, inFASTA);
229                                         getNumSeqs(inFASTA, numSeqs);
230                                         inFASTA.close();
231                                         
232                                         lines.push_back(new linePair(0, numSeqs));
233                                         
234                                         driverCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, groupFile, fastaFileNames, lines[0], lines[0]);
235                                         
236                                         for (int j = 0; j < fastaFileNames.size(); j++) {
237                                                 rename((fastaFileNames[j] + toString(getpid()) + ".temp").c_str(), fastaFileNames[j].c_str());
238                                         }
239
240                                 }else{
241                                         setLines(fastaFile, lines);     
242                                         if(qFileName == "")     {       qLines = lines; }       
243                                                                 
244                                         createProcessesCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, groupFile, fastaFileNames); 
245                                         
246                                         rename((trimSeqFile + toString(processIDS[0]) + ".temp").c_str(), trimSeqFile.c_str());
247                                         rename((scrapSeqFile + toString(processIDS[0]) + ".temp").c_str(), scrapSeqFile.c_str());
248                                         rename((groupFile + toString(processIDS[0]) + ".temp").c_str(), groupFile.c_str());
249                                         for (int j = 0; j < fastaFileNames.size(); j++) {
250                                                 rename((fastaFileNames[j] + toString(processIDS[0]) + ".temp").c_str(), fastaFileNames[j].c_str());
251                                         }
252                                         //append files
253                                         for(int i=1;i<processors;i++){
254                                                 appendFiles((trimSeqFile + toString(processIDS[i]) + ".temp"), trimSeqFile);
255                                                 remove((trimSeqFile + toString(processIDS[i]) + ".temp").c_str());
256                                                 appendFiles((scrapSeqFile + toString(processIDS[i]) + ".temp"), scrapSeqFile);
257                                                 remove((scrapSeqFile + toString(processIDS[i]) + ".temp").c_str());
258                                                 appendFiles((groupFile + toString(processIDS[i]) + ".temp"), groupFile);
259                                                 remove((groupFile + toString(processIDS[i]) + ".temp").c_str());
260                                                 for (int j = 0; j < fastaFileNames.size(); j++) {
261                                                         appendFiles((fastaFileNames[j] + toString(processIDS[i]) + ".temp"), fastaFileNames[j]);
262                                                         remove((fastaFileNames[j] + toString(processIDS[i]) + ".temp").c_str());
263                                                 }
264                                         }
265                                 }
266                                 
267                                 if (m->control_pressed) {  return 0; }
268                 #else
269                                 ifstream inFASTA;
270                                 int numSeqs;
271                                 openInputFile(fastaFile, inFASTA);
272                                 getNumSeqs(inFASTA, numSeqs);
273                                 inFASTA.close();
274                                 
275                                 lines.push_back(new linePair(0, numSeqs));
276                                 
277                                 driverCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, groupFile, fastaFileNames, lines[0], lines[0]);
278                                 
279                                 if (m->control_pressed) {  return 0; }
280                 #endif
281                                                 
282                                                                                 
283                 for(int i=0;i<fastaFileNames.size();i++){
284                         ifstream inFASTA;
285                         string seqName;
286                         openInputFile(getRootName(fastaFile) + groupVector[i] + ".fasta", inFASTA);
287                         ofstream outGroups;
288                         openOutputFile(outputDir + getRootName(getSimpleName(fastaFile)) + groupVector[i] + ".groups", outGroups);
289                         outputNames.push_back(outputDir + getRootName(getSimpleName(fastaFile)) + groupVector[i] + ".groups");
290                         
291                         while(!inFASTA.eof()){
292                                 if(inFASTA.get() == '>'){
293                                         inFASTA >> seqName;
294                                         outGroups << seqName << '\t' << groupVector[i] << endl;
295                                 }
296                                 while (!inFASTA.eof())  {       char c = inFASTA.get(); if (c == 10 || c == 13){        break;  }       }
297                         }
298                         outGroups.close();
299                         inFASTA.close();
300                 }
301                 
302                 if (m->control_pressed) { 
303                         for (int i = 0; i < outputNames.size(); i++) {  remove(outputNames[i].c_str()); }
304                         return 0;
305                 }
306
307                 m->mothurOutEndLine();
308                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
309                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
310                 m->mothurOutEndLine();
311                 
312                 return 0;       
313                         
314         }
315         catch(exception& e) {
316                 m->errorOut(e, "TrimSeqsCommand", "execute");
317                 exit(1);
318         }
319 }
320                 
321 /**************************************************************************************/
322 int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string trimFile, string scrapFile, string groupFile, vector<string> fastaNames, linePair* line, linePair* qline) {     
323         try {
324                 
325                 ofstream outFASTA;
326                 int able = openOutputFile(trimFile, outFASTA);
327                 
328                 ofstream scrapFASTA;
329                 openOutputFile(scrapFile, scrapFASTA);
330                 
331                 ofstream outGroups;
332                 vector<ofstream*> fastaFileNames;
333                 if (oligoFile != "") {          
334                         openOutputFile(groupFile, outGroups);   
335                         for (int i = 0; i < fastaNames.size(); i++) {
336                         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
337                                 fastaFileNames.push_back(new ofstream((fastaNames[i] + toString(getpid()) + ".temp").c_str(), ios::ate)); 
338                         #else
339                                 fastaFileNames.push_back(new ofstream((fastaNames[i] + toString(i) + ".temp").c_str(), ios::ate));                      
340                         #endif
341                         }
342                 }
343                 
344                 ifstream inFASTA;
345                 openInputFile(filename, inFASTA);
346                 
347                 ifstream qFile;
348                 if(qFileName != "")     {       openInputFile(qFileName, qFile);        }
349                 
350                 qFile.seekg(qline->start);
351                 inFASTA.seekg(line->start);
352                 
353                 for(int i=0;i<line->num;i++){
354                                 
355                         if (m->control_pressed) { 
356                                 inFASTA.close(); 
357                                 outFASTA.close();
358                                 scrapFASTA.close();
359                                 if (oligoFile != "") {   outGroups.close();   }
360                                 if(qFileName != "")     {       qFile.close();  }
361                                 for(int i=0;i<fastaFileNames.size();i++){
362                                         fastaFileNames[i]->close();
363                                         delete fastaFileNames[i];
364                                 }       
365                                 for (int i = 0; i < outputNames.size(); i++) {  remove(outputNames[i].c_str()); }
366                                 return 0;
367                         }
368                         
369                         int success = 1;
370                         
371                         Sequence currSeq(inFASTA);
372
373                         string origSeq = currSeq.getUnaligned();
374                         if (origSeq != "") {
375                                 int group;
376                                 string trashCode = "";
377                                 int currentSeqsDiffs = 0;
378                                 
379                                 if(qFileName != ""){
380                                         if(qThreshold != 0)             {       success = stripQualThreshold(currSeq, qFile);   }
381                                         else if(qAverage != 0)  {       success = cullQualAverage(currSeq, qFile);              }
382                                         
383                                         if (qtrim == 1 && (origSeq.length() != currSeq.getUnaligned().length())) { 
384                                                 success = 0; //if you don't want to trim and the sequence does not meet quality requirements, move to scrap
385                                         }
386
387                                         if(!success)                    {       trashCode += 'q';                                                               }
388                                 }
389                         
390                                 if(barcodes.size() != 0){
391                                         success = stripBarcode(currSeq, group);
392                                         if(success > bdiffs){   trashCode += 'b';       }
393                                         else{ currentSeqsDiffs += success;  }
394                                 }
395
396                                 if(numFPrimers != 0){
397                                         success = stripForward(currSeq);
398                                         if(success > pdiffs){   trashCode += 'f';       }
399                                         else{ currentSeqsDiffs += success;  }
400                                 }
401                                 
402                                 if (currentSeqsDiffs > tdiffs) { trashCode += 't';   }
403
404                                 if(numRPrimers != 0){
405                                         success = stripReverse(currSeq);
406                                         if(!success){   trashCode += 'r';       }
407                                 }
408                 
409                                 if(minLength > 0 || maxLength > 0){
410                                         success = cullLength(currSeq);
411                                         if(!success){   trashCode += 'l'; }
412                                 }
413                                 if(maxHomoP > 0){
414                                         success = cullHomoP(currSeq);
415                                         if(!success){   trashCode += 'h';       }
416                                 }
417                                 if(maxAmbig != -1){
418                                         success = cullAmbigs(currSeq);
419                                         if(!success){   trashCode += 'n';       }
420                                 }
421                                 
422                                 if(flip){       currSeq.reverseComplement();    }               // should go last                       
423                                 
424                                 if(trashCode.length() == 0){
425                                         currSeq.setAligned(currSeq.getUnaligned());
426                                         currSeq.printSequence(outFASTA);
427                                         if(barcodes.size() != 0){
428                                                 outGroups << currSeq.getName() << '\t' << groupVector[group] << endl;
429                                                 
430                                                 if(allFiles){
431                                                         currSeq.printSequence(*fastaFileNames[group]);                                  
432                                                 }
433                                         }
434                                 }
435                                 else{
436                                         currSeq.setName(currSeq.getName() + '|' + trashCode);
437                                         currSeq.setUnaligned(origSeq);
438                                         currSeq.setAligned(origSeq);
439                                         currSeq.printSequence(scrapFASTA);
440                                 }
441                         }
442                         gobble(inFASTA);
443                 }
444                 
445                 inFASTA.close();
446                 outFASTA.close();
447                 scrapFASTA.close();
448                 if (oligoFile != "") {   outGroups.close();   }
449                 if(qFileName != "")     {       qFile.close();  }
450                 
451                 for(int i=0;i<fastaFileNames.size();i++){
452                         fastaFileNames[i]->close();
453                         delete fastaFileNames[i];
454                 }               
455                 
456                 return 0;
457         }
458         catch(exception& e) {
459                 m->errorOut(e, "TrimSeqsCommand", "driverCreateTrim");
460                 exit(1);
461         }
462 }
463 /**************************************************************************************************/
464 int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName, string trimFile, string scrapFile, string groupFile, vector<string> fastaNames) {
465         try {
466 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
467                 int process = 0;
468                 int exitCommand = 1;
469                 processIDS.clear();
470                 
471                 //loop through and create all the processes you want
472                 while (process != processors) {
473                         int pid = fork();
474                         
475                         if (pid > 0) {
476                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
477                                 process++;
478                         }else if (pid == 0){
479                                 driverCreateTrim(filename, qFileName, (trimFile + toString(getpid()) + ".temp"), (scrapFile + toString(getpid()) + ".temp"), (groupFile + toString(getpid()) + ".temp"), fastaNames, lines[process], qLines[process]);
480                                 exit(0);
481                         }else { m->mothurOut("unable to spawn the necessary processes."); m->mothurOutEndLine(); exit(0); }
482                 }
483                 
484                 //force parent to wait until all the processes are done
485                 for (int i=0;i<processors;i++) { 
486                         int temp = processIDS[i];
487                         wait(&temp);
488                 }
489                 
490                 return exitCommand;
491 #endif          
492         }
493         catch(exception& e) {
494                 m->errorOut(e, "TrimSeqsCommand", "createProcessesCreateTrim");
495                 exit(1);
496         }
497 }
498 /**************************************************************************************************/
499
500 int TrimSeqsCommand::setLines(string filename, vector<linePair*>& lines) {
501         try {
502                 
503                 lines.clear();
504                 
505                 vector<long int> positions;
506                 
507                 ifstream inFASTA;
508                 openInputFile(filename, inFASTA);
509                         
510                 string input;
511                 while(!inFASTA.eof()){  
512                         input = getline(inFASTA);
513
514                         if (input.length() != 0) {
515                                 if(input[0] == '>'){ long int pos = inFASTA.tellg(); positions.push_back(pos - input.length() - 1);     }
516                         }
517                 }
518                 inFASTA.close();
519                 
520                 int numFastaSeqs = positions.size();
521         
522                 FILE * pFile;
523                 long size;
524                 
525                 //get num bytes in file
526                 pFile = fopen (filename.c_str(),"rb");
527                 if (pFile==NULL) perror ("Error opening file");
528                 else{
529                         fseek (pFile, 0, SEEK_END);
530                         size=ftell (pFile);
531                         fclose (pFile);
532                 }
533                 
534                 int numSeqsPerProcessor = numFastaSeqs / processors;
535                 
536                 for (int i = 0; i < processors; i++) {
537
538                         long int startPos = positions[ i * numSeqsPerProcessor ];
539                         if(i == processors - 1){
540                                 numSeqsPerProcessor = numFastaSeqs - i * numSeqsPerProcessor;
541                         }else{  
542                                 long int myEnd = positions[ (i+1) * numSeqsPerProcessor ];
543                         }
544                         lines.push_back(new linePair(startPos, numSeqsPerProcessor));
545                 }
546                 
547                 return numFastaSeqs;
548         }
549         catch(exception& e) {
550                 m->errorOut(e, "TrimSeqsCommand", "setLines");
551                 exit(1);
552         }
553 }
554 //***************************************************************************************************************
555
556 void TrimSeqsCommand::getOligos(vector<string>& outFASTAVec){ //vector<ofstream*>& outFASTAVec
557         try {
558                 ifstream inOligos;
559                 openInputFile(oligoFile, inOligos);
560                 
561                 ofstream test;
562                 
563                 string type, oligo, group;
564                 int index=0;
565                 
566                 while(!inOligos.eof()){
567                         inOligos >> type;
568                         
569                         if(type[0] == '#'){
570                                 while (!inOligos.eof()) {       char c = inOligos.get(); if (c == 10 || c == 13){       break;  }       } // get rest of line if there's any crap there
571                         }
572                         else{
573                                 inOligos >> oligo;
574                                 
575                                 for(int i=0;i<oligo.length();i++){
576                                         oligo[i] = toupper(oligo[i]);
577                                         if(oligo[i] == 'U')     {       oligo[i] = 'T'; }
578                                 }
579                                 
580                                 if(type == "forward"){
581                                         forPrimer.push_back(oligo);
582                                 }
583                                 else if(type == "reverse"){
584                                         Sequence oligoRC("reverse", oligo);
585                                         oligoRC.reverseComplement();
586                                         revPrimer.push_back(oligoRC.getUnaligned());
587                                 }
588                                 else if(type == "barcode"){
589                                         inOligos >> group;
590                                         barcodes[oligo]=index++;
591                                         groupVector.push_back(group);
592                                         
593                                         if(allFiles){
594                                                 //outFASTAVec.push_back(new ofstream((outputDir + getRootName(getSimpleName(fastaFile)) + group + ".fasta").c_str(), ios::ate));
595                                                 outputNames.push_back((outputDir + getRootName(getSimpleName(fastaFile)) + group + ".fasta"));
596                                                 outFASTAVec.push_back((outputDir + getRootName(getSimpleName(fastaFile)) + group + ".fasta"));
597                                         }
598                                 }
599                         }
600                 }
601                 
602                 inOligos.close();
603                 
604                 numFPrimers = forPrimer.size();
605                 numRPrimers = revPrimer.size();
606                 
607         }
608         catch(exception& e) {
609                 m->errorOut(e, "TrimSeqsCommand", "getOligos");
610                 exit(1);
611         }
612 }
613 //***************************************************************************************************************
614
615 int TrimSeqsCommand::stripBarcode(Sequence& seq, int& group){
616         try {
617                 
618                 string rawSequence = seq.getUnaligned();
619                 int success = bdiffs + 1;       //guilty until proven innocent
620                 
621                 //can you find the barcode
622                 for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
623                         string oligo = it->first;
624                         if(rawSequence.length() < oligo.length()){      //let's just assume that the barcodes are the same length
625                                 success = bdiffs + 10;                                  //if the sequence is shorter than the barcode then bail out
626                                 break;  
627                         }
628                         
629                         if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
630                                 group = it->second;
631                                 seq.setUnaligned(rawSequence.substr(oligo.length()));
632                                 success = 0;
633                                 break;
634                         }
635                 }
636                 
637                 //if you found the barcode or if you don't want to allow for diffs
638 //              cout << success;
639                 if ((bdiffs == 0) || (success == 0)) { return success;  }
640                 
641                 else { //try aligning and see if you can find it
642 //                      cout << endl;
643
644                         int maxLength = 0;
645
646                         Alignment* alignment;
647                         if (barcodes.size() > 0) {
648                                 map<string,int>::iterator it=barcodes.begin();
649
650                                 for(it;it!=barcodes.end();it++){
651                                         if(it->first.length() > maxLength){
652                                                 maxLength = it->first.length();
653                                         }
654                                 }
655                                 alignment = new NeedlemanOverlap(-2.0, 1.0, -1.0, (maxLength+bdiffs+1));  
656
657                         }else{ alignment = NULL; } 
658                         
659                         //can you find the barcode
660                         int minDiff = 1e6;
661                         int minCount = 1;
662                         int minGroup = -1;
663                         int minPos = 0;
664                         
665                         for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
666                                 string oligo = it->first;
667 //                              int length = oligo.length();
668                                 
669                                 if(rawSequence.length() < maxLength){   //let's just assume that the barcodes are the same length
670                                         success = bdiffs + 10;
671                                         break;
672                                 }
673                                 
674                                 //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
675                                 alignment->align(oligo, rawSequence.substr(0,oligo.length()+bdiffs));
676                                 oligo = alignment->getSeqAAln();
677                                 string temp = alignment->getSeqBAln();
678                 
679                                 int alnLength = oligo.length();
680                                 
681                                 for(int i=oligo.length()-1;i>=0;i--){
682                                         if(oligo[i] != '-'){    alnLength = i+1;        break;  }
683                                 }
684                                 oligo = oligo.substr(0,alnLength);
685                                 temp = temp.substr(0,alnLength);
686                                 
687                                 int newStart=0;
688                                 int numDiff = countDiffs(oligo, temp);
689                                 if(numDiff < minDiff){
690                                         minDiff = numDiff;
691                                         minCount = 1;
692                                         minGroup = it->second;
693                                         minPos = 0;
694                                         for(int i=0;i<alnLength;i++){
695                                                 if(temp[i] != '-'){
696                                                         minPos++;
697                                                 }
698                                         }
699                                 }
700                                 else if(numDiff == minDiff){
701                                         minCount++;
702                                 }
703
704                         }
705
706                         if(minDiff > bdiffs)    {       success = minDiff;              }       //no good matches
707                         else if(minCount > 1)   {       success = bdiffs + 100; }       //can't tell the difference between multiple barcodes
708                         else{                                                                                                   //use the best match
709                                 group = minGroup;
710                                 seq.setUnaligned(rawSequence.substr(minPos));
711                                 success = minDiff;
712                         }
713                         
714                         if (alignment != NULL) {  delete alignment;  }
715                         
716                 }
717 //              cout << success << endl;
718                 
719                 return success;
720                 
721         }
722         catch(exception& e) {
723                 m->errorOut(e, "TrimSeqsCommand", "stripBarcode");
724                 exit(1);
725         }
726
727 }
728
729 //***************************************************************************************************************
730
731 int TrimSeqsCommand::stripForward(Sequence& seq){
732         try {
733                 string rawSequence = seq.getUnaligned();
734                 int success = pdiffs + 1;       //guilty until proven innocent
735                 
736                 //can you find the primer
737                 for(int i=0;i<numFPrimers;i++){
738                         string oligo = forPrimer[i];
739
740                         if(rawSequence.length() < oligo.length()){      
741                                 success = pdiffs + 1;
742                                 break;
743                         }
744                         
745                         if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
746                                 seq.setUnaligned(rawSequence.substr(oligo.length()));
747                                 success = 0;
748                                 break;
749                         }
750                 }
751                 
752                 //if you found the barcode or if you don't want to allow for diffs
753 //              cout << success;
754                 if ((pdiffs == 0) || (success == 0)) { return success;  }
755                 
756                 else { //try aligning and see if you can find it
757 //                      cout << endl;
758
759                         int maxLength = 0;
760
761                         Alignment* alignment;
762                         if (numFPrimers > 0) {
763
764                                 for(int i=0;i<numFPrimers;i++){
765                                         if(forPrimer[i].length() > maxLength){
766                                                 maxLength = forPrimer[i].length();
767                                         }
768                                 }
769                                 alignment = new NeedlemanOverlap(-2.0, 1.0, -1.0, (maxLength+pdiffs+1));  
770
771                         }else{ alignment = NULL; } 
772                         
773                         //can you find the barcode
774                         int minDiff = 1e6;
775                         int minCount = 1;
776                         int minPos = 0;
777                         
778                         for(int i=0;i<numFPrimers;i++){
779                                 string oligo = forPrimer[i];
780                                 
781                                 if(rawSequence.length() < maxLength){   
782                                         success = pdiffs + 100;
783                                         break;
784                                 }
785                                 
786                                 //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
787                                 alignment->align(oligo, rawSequence.substr(0,oligo.length()+pdiffs));
788                                 oligo = alignment->getSeqAAln();
789                                 string temp = alignment->getSeqBAln();
790                 
791                                 int alnLength = oligo.length();
792                                 
793                                 for(int i=oligo.length()-1;i>=0;i--){
794                                         if(oligo[i] != '-'){    alnLength = i+1;        break;  }
795                                 }
796                                 oligo = oligo.substr(0,alnLength);
797                                 temp = temp.substr(0,alnLength);
798
799                                 int newStart=0;
800                                 int numDiff = countDiffs(oligo, temp);
801                                 if(numDiff < minDiff){
802                                         minDiff = numDiff;
803                                         minCount = 1;
804                                         minPos = 0;
805                                         for(int i=0;i<alnLength;i++){
806                                                 if(temp[i] != '-'){
807                                                         minPos++;
808                                                 }
809                                         }
810                                 }
811                                 else if(numDiff == minDiff){
812                                         minCount++;
813                                 }
814
815                         }
816                         if(minDiff > pdiffs)    {       success =  minDiff;             }
817                         else if(minCount > 1)   {       success =  pdiffs + 10; }
818                         else{
819                                 seq.setUnaligned(rawSequence.substr(minPos));
820                                 success = minDiff;
821                         }
822                         
823                         if (alignment != NULL) {  delete alignment;  }
824                         
825                 }
826                 return success;
827
828         }
829         catch(exception& e) {
830                 m->errorOut(e, "TrimSeqsCommand", "stripForward");
831                 exit(1);
832         }
833 }
834
835 //***************************************************************************************************************
836
837 bool TrimSeqsCommand::stripReverse(Sequence& seq){
838         try {
839                 string rawSequence = seq.getUnaligned();
840                 bool success = 0;       //guilty until proven innocent
841                 
842                 for(int i=0;i<numRPrimers;i++){
843                         string oligo = revPrimer[i];
844                         
845                         if(rawSequence.length() < oligo.length()){
846                                 success = 0;
847                                 break;
848                         }
849                         
850                         if(compareDNASeq(oligo, rawSequence.substr(rawSequence.length()-oligo.length(),oligo.length()))){
851                                 seq.setUnaligned(rawSequence.substr(0,rawSequence.length()-oligo.length()));
852                                 success = 1;
853                                 break;
854                         }
855                 }       
856                 return success;
857                 
858         }
859         catch(exception& e) {
860                 m->errorOut(e, "TrimSeqsCommand", "stripReverse");
861                 exit(1);
862         }
863 }
864
865 //***************************************************************************************************************
866
867 bool TrimSeqsCommand::cullLength(Sequence& seq){
868         try {
869         
870                 int length = seq.getNumBases();
871                 bool success = 0;       //guilty until proven innocent
872                 
873                 if(length >= minLength && maxLength == 0)                       {       success = 1;    }
874                 else if(length >= minLength && length <= maxLength)     {       success = 1;    }
875                 else                                                                                            {       success = 0;    }
876                 
877                 return success;
878         
879         }
880         catch(exception& e) {
881                 m->errorOut(e, "TrimSeqsCommand", "cullLength");
882                 exit(1);
883         }
884         
885 }
886
887 //***************************************************************************************************************
888
889 bool TrimSeqsCommand::cullHomoP(Sequence& seq){
890         try {
891                 int longHomoP = seq.getLongHomoPolymer();
892                 bool success = 0;       //guilty until proven innocent
893                 
894                 if(longHomoP <= maxHomoP){      success = 1;    }
895                 else                                    {       success = 0;    }
896                 
897                 return success;
898         }
899         catch(exception& e) {
900                 m->errorOut(e, "TrimSeqsCommand", "cullHomoP");
901                 exit(1);
902         }
903         
904 }
905
906 //***************************************************************************************************************
907
908 bool TrimSeqsCommand::cullAmbigs(Sequence& seq){
909         try {
910                 int numNs = seq.getAmbigBases();
911                 bool success = 0;       //guilty until proven innocent
912                 
913                 if(numNs <= maxAmbig)   {       success = 1;    }
914                 else                                    {       success = 0;    }
915                 
916                 return success;
917         }
918         catch(exception& e) {
919                 m->errorOut(e, "TrimSeqsCommand", "cullAmbigs");
920                 exit(1);
921         }
922         
923 }
924
925 //***************************************************************************************************************
926
927 bool TrimSeqsCommand::compareDNASeq(string oligo, string seq){
928         try {
929                 bool success = 1;
930                 int length = oligo.length();
931                 
932                 for(int i=0;i<length;i++){
933                         
934                         if(oligo[i] != seq[i]){
935                                 if(oligo[i] == 'A' || oligo[i] == 'T' || oligo[i] == 'G' || oligo[i] == 'C')    {       success = 0;    }
936                                 else if((oligo[i] == 'N' || oligo[i] == 'I') && (seq[i] == 'N'))                                {       success = 0;    }
937                                 else if(oligo[i] == 'R' && (seq[i] != 'A' && seq[i] != 'G'))                                    {       success = 0;    }
938                                 else if(oligo[i] == 'Y' && (seq[i] != 'C' && seq[i] != 'T'))                                    {       success = 0;    }
939                                 else if(oligo[i] == 'M' && (seq[i] != 'C' && seq[i] != 'A'))                                    {       success = 0;    }
940                                 else if(oligo[i] == 'K' && (seq[i] != 'T' && seq[i] != 'G'))                                    {       success = 0;    }
941                                 else if(oligo[i] == 'W' && (seq[i] != 'T' && seq[i] != 'A'))                                    {       success = 0;    }
942                                 else if(oligo[i] == 'S' && (seq[i] != 'C' && seq[i] != 'G'))                                    {       success = 0;    }
943                                 else if(oligo[i] == 'B' && (seq[i] != 'C' && seq[i] != 'T' && seq[i] != 'G'))   {       success = 0;    }
944                                 else if(oligo[i] == 'D' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'G'))   {       success = 0;    }
945                                 else if(oligo[i] == 'H' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'C'))   {       success = 0;    }
946                                 else if(oligo[i] == 'V' && (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G'))   {       success = 0;    }                       
947                                 
948                                 if(success == 0)        {       break;   }
949                         }
950                         else{
951                                 success = 1;
952                         }
953                 }
954                 
955                 return success;
956         }
957         catch(exception& e) {
958                 m->errorOut(e, "TrimSeqsCommand", "compareDNASeq");
959                 exit(1);
960         }
961
962 }
963 //***************************************************************************************************************
964
965 int TrimSeqsCommand::countDiffs(string oligo, string seq){
966         try {
967
968                 int length = oligo.length();
969                 int countDiffs = 0;
970                 
971                 for(int i=0;i<length;i++){
972                                                                 
973                         if(oligo[i] != seq[i]){
974                                 if(oligo[i] == 'A' || oligo[i] == 'T' || oligo[i] == 'G' || oligo[i] == 'C' || oligo[i] == '-' || oligo[i] == '.')      {       countDiffs++;   }
975                                 else if((oligo[i] == 'N' || oligo[i] == 'I') && (seq[i] == 'N'))                                {       countDiffs++;   }
976                                 else if(oligo[i] == 'R' && (seq[i] != 'A' && seq[i] != 'G'))                                    {       countDiffs++;   }
977                                 else if(oligo[i] == 'Y' && (seq[i] != 'C' && seq[i] != 'T'))                                    {       countDiffs++;   }
978                                 else if(oligo[i] == 'M' && (seq[i] != 'C' && seq[i] != 'A'))                                    {       countDiffs++;   }
979                                 else if(oligo[i] == 'K' && (seq[i] != 'T' && seq[i] != 'G'))                                    {       countDiffs++;   }
980                                 else if(oligo[i] == 'W' && (seq[i] != 'T' && seq[i] != 'A'))                                    {       countDiffs++;   }
981                                 else if(oligo[i] == 'S' && (seq[i] != 'C' && seq[i] != 'G'))                                    {       countDiffs++;   }
982                                 else if(oligo[i] == 'B' && (seq[i] != 'C' && seq[i] != 'T' && seq[i] != 'G'))   {       countDiffs++;   }
983                                 else if(oligo[i] == 'D' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'G'))   {       countDiffs++;   }
984                                 else if(oligo[i] == 'H' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'C'))   {       countDiffs++;   }
985                                 else if(oligo[i] == 'V' && (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G'))   {       countDiffs++;   }                       
986                                 
987                         }
988                 }
989                 
990                 return countDiffs;
991         }
992         catch(exception& e) {
993                 m->errorOut(e, "TrimSeqsCommand", "countDiffs");
994                 exit(1);
995         }
996
997 }
998 //***************************************************************************************************************
999
1000 bool TrimSeqsCommand::stripQualThreshold(Sequence& seq, ifstream& qFile){
1001         try {
1002 //              string rawSequence = seq.getUnaligned();
1003 //              int seqLength;  // = rawSequence.length();
1004 //              string name, temp, temp2;
1005 //              
1006 //              qFile >> name;
1007 //              
1008 //              //get rest of line
1009 //              temp = "";
1010 //              while (!qFile.eof())    {       
1011 //                      char c = qFile.get(); 
1012 //                      if (c == 10 || c == 13){        break;  }       
1013 //                      else { temp += c; }
1014 //              } 
1015 //      
1016 //              int pos = temp.find("length");
1017 //              if (pos == temp.npos) { m->mothurOut("Cannot find length in qfile for " + seq.getName()); m->mothurOutEndLine();  seqLength = 0;  }
1018 //              else {
1019 //                      string tempLength = temp.substr(pos);
1020 //                      istringstream iss (tempLength,istringstream::in);
1021 //                      iss >> temp;
1022 //              }
1023 //              
1024 //              splitAtEquals(temp2, temp); //separates length=242, temp=length, temp2=242
1025 //              convert(temp, seqLength); //converts string to int
1026 //      
1027 //              if (name.length() != 0) {  if(name.substr(1) != seq.getName())  {       m->mothurOut("sequence name mismatch btwn fasta and qual file"); m->mothurOutEndLine(); }  } 
1028                 
1029                 string rawSequence = seq.getUnaligned();
1030                 int seqLength = seq.getNumBases();
1031                 bool success = 0;       //guilty until proven innocent
1032                 string name;
1033                 
1034                 qFile >> name;
1035                 if (name[0] == '>') {  if(name.substr(1) != seq.getName())      {       m->mothurOut("sequence name mismatch btwn fasta: " + seq.getName() + " and qual file: " + name); m->mothurOutEndLine(); }  }
1036                 
1037                 while (!qFile.eof())    {       char c = qFile.get(); if (c == 10 || c == 13){  break;  }       }
1038                 
1039                 int score;
1040                 int end = seqLength;
1041                 
1042                 for(int i=0;i<seqLength;i++){
1043                         qFile >> score;
1044                         
1045                         if(score < qThreshold){
1046                                 end = i;
1047                                 break;
1048                         }
1049                 }
1050                 for(int i=end+1;i<seqLength;i++){
1051                         qFile >> score;
1052                 }
1053                 
1054                 seq.setUnaligned(rawSequence.substr(0,end));
1055                 
1056                 return 1;
1057         }
1058         catch(exception& e) {
1059                 m->errorOut(e, "TrimSeqsCommand", "stripQualThreshold");
1060                 exit(1);
1061         }
1062 }
1063
1064 //***************************************************************************************************************
1065
1066 bool TrimSeqsCommand::cullQualAverage(Sequence& seq, ifstream& qFile){
1067         try {
1068                 string rawSequence = seq.getUnaligned();
1069                 int seqLength = seq.getNumBases();
1070                 bool success = 0;       //guilty until proven innocent
1071                 string name;
1072                 
1073                 qFile >> name;
1074                 if (name[0] == '>') {  if(name.substr(1) != seq.getName())      {       m->mothurOut("sequence name mismatch btwn fasta: " + seq.getName() + " and qual file: " + name); m->mothurOutEndLine(); }  }
1075                 
1076                 while (!qFile.eof())    {       char c = qFile.get(); if (c == 10 || c == 13){  break;  }       }
1077                 
1078                 float score;    
1079                 float average = 0;
1080                 
1081                 for(int i=0;i<seqLength;i++){
1082                         qFile >> score;
1083                         average += score;
1084                 }
1085                 average /= seqLength;
1086
1087                 if(average >= qAverage) {       success = 1;    }
1088                 else                                    {       success = 0;    }
1089                 
1090                 return success;
1091         }
1092         catch(exception& e) {
1093                 m->errorOut(e, "TrimSeqsCommand", "cullQualAverage");
1094                 exit(1);
1095         }
1096 }
1097
1098 //***************************************************************************************************************