]> git.donarmstrong.com Git - mothur.git/blob - trimseqscommand.cpp
71e5c9277af61fef9884f42b2c8e9d54a2796e12
[mothur.git] / trimseqscommand.cpp
1 /*
2  *  trimseqscommand.cpp
3  *  Mothur
4  *
5  *  Created by Pat Schloss on 6/6/09.
6  *  Copyright 2009 Patrick D. Schloss. All rights reserved.
7  *
8  */
9
10 #include "trimseqscommand.h"
11 #include "needlemanoverlap.hpp"
12
13 //***************************************************************************************************************
14
15 TrimSeqsCommand::TrimSeqsCommand(string option)  {
16         try {
17                 
18                 abort = false;
19                 
20                 //allow user to run help
21                 if(option == "help") { help(); abort = true; }
22                 
23                 else {
24                         //valid paramters for this command
25                         string AlignArray[] =  {"fasta", "flip", "oligos", "maxambig", "maxhomop", "minlength", "maxlength", "qfile", 
26                                                                         "qthreshold", "qaverage", "allfiles", "qtrim","tdiffs", "pdiffs", "bdiffs", "processors", "outputdir","inputdir"};
27                         
28                         vector<string> myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string)));
29                         
30                         OptionParser parser(option);
31                         map<string,string> parameters = parser.getParameters();
32                         
33                         ValidParameters validParameter;
34                         map<string,string>::iterator it;
35                         
36                         //check to make sure all parameters are valid for command
37                         for (it = parameters.begin(); it != parameters.end(); it++) { 
38                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
39                         }
40                         
41                         //if the user changes the input directory command factory will send this info to us in the output parameter 
42                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
43                         if (inputDir == "not found"){   inputDir = "";          }
44                         else {
45                                 string path;
46                                 it = parameters.find("fasta");
47                                 //user has given a template file
48                                 if(it != parameters.end()){ 
49                                         path = hasPath(it->second);
50                                         //if the user has not given a path then, add inputdir. else leave path alone.
51                                         if (path == "") {       parameters["fasta"] = inputDir + it->second;            }
52                                 }
53                                 
54                                 it = parameters.find("oligos");
55                                 //user has given a template file
56                                 if(it != parameters.end()){ 
57                                         path = hasPath(it->second);
58                                         //if the user has not given a path then, add inputdir. else leave path alone.
59                                         if (path == "") {       parameters["oligos"] = inputDir + it->second;           }
60                                 }
61                                 
62                                 it = parameters.find("qfile");
63                                 //user has given a template file
64                                 if(it != parameters.end()){ 
65                                         path = hasPath(it->second);
66                                         //if the user has not given a path then, add inputdir. else leave path alone.
67                                         if (path == "") {       parameters["qfile"] = inputDir + it->second;            }
68                                 }
69                         }
70
71                         
72                         //check for required parameters
73                         fastaFile = validParameter.validFile(parameters, "fasta", true);
74                         if (fastaFile == "not found") { m->mothurOut("fasta is a required parameter for the screen.seqs command."); m->mothurOutEndLine(); abort = true; }
75                         else if (fastaFile == "not open") { abort = true; }     
76                         
77                         //if the user changes the output directory command factory will send this info to us in the output parameter 
78                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  
79                                 outputDir = ""; 
80                                 outputDir += hasPath(fastaFile); //if user entered a file with a path then preserve it  
81                         }
82                 
83                         //check for optional parameter and set defaults
84                         // ...at some point should added some additional type checking...
85                         string temp;
86                         temp = validParameter.validFile(parameters, "flip", false);
87                         if (temp == "not found"){       flip = 0;       }
88                         else if(isTrue(temp))   {       flip = 1;       }
89                 
90                         temp = validParameter.validFile(parameters, "oligos", true);
91                         if (temp == "not found"){       oligoFile = "";         }
92                         else if(temp == "not open"){    abort = true;   } 
93                         else                                    {       oligoFile = temp;               }
94                         
95                         temp = validParameter.validFile(parameters, "maxambig", false);         if (temp == "not found") { temp = "-1"; }
96                         convert(temp, maxAmbig);  
97
98                         temp = validParameter.validFile(parameters, "maxhomop", false);         if (temp == "not found") { temp = "0"; }
99                         convert(temp, maxHomoP);  
100
101                         temp = validParameter.validFile(parameters, "minlength", false);        if (temp == "not found") { temp = "0"; }
102                         convert(temp, minLength); 
103                         
104                         temp = validParameter.validFile(parameters, "maxlength", false);        if (temp == "not found") { temp = "0"; }
105                         convert(temp, maxLength);
106                         
107                         
108                         temp = validParameter.validFile(parameters, "bdiffs", false);           if (temp == "not found") { temp = "0"; }
109                         convert(temp, bdiffs);
110                         
111                         temp = validParameter.validFile(parameters, "pdiffs", false);           if (temp == "not found") { temp = "0"; }
112                         convert(temp, pdiffs);
113                         
114                         temp = validParameter.validFile(parameters, "tdiffs", false);           if (temp == "not found") { int tempTotal = pdiffs + bdiffs;  temp = toString(tempTotal); }
115                         convert(temp, tdiffs);
116                         
117                         if(tdiffs == 0){        tdiffs = bdiffs + pdiffs;       }
118                         
119                         temp = validParameter.validFile(parameters, "qfile", true);     
120                         if (temp == "not found")        {       qFileName = "";         }
121                         else if(temp == "not open")     {       abort = true;           }
122                         else                                            {       qFileName = temp;       }
123                         
124                         temp = validParameter.validFile(parameters, "qthreshold", false);       if (temp == "not found") { temp = "0"; }
125                         convert(temp, qThreshold);
126                         
127                         temp = validParameter.validFile(parameters, "qtrim", false);    if (temp == "not found") { temp = "F"; }
128                         qtrim = isTrue(temp);
129
130                         temp = validParameter.validFile(parameters, "qaverage", false);         if (temp == "not found") { temp = "0"; }
131                         convert(temp, qAverage);
132                         
133                         temp = validParameter.validFile(parameters, "allfiles", false);         if (temp == "not found") { temp = "F"; }
134                         allFiles = isTrue(temp);
135                         
136                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found"){       temp = "1";                             }
137                         convert(temp, processors); 
138                         
139                         if(allFiles && oligoFile == ""){
140                                 m->mothurOut("You selected allfiles, but didn't enter an oligos file.  Ignoring the allfiles request."); m->mothurOutEndLine();
141                         }
142                         if((qAverage != 0 && qThreshold != 0) && qFileName == ""){
143                                 m->mothurOut("You didn't provide a quality file name, quality criteria will be ignored."); m->mothurOutEndLine();
144                                 qAverage=0;
145                                 qThreshold=0;
146                         }
147                         if(!flip && oligoFile=="" && !maxLength && !minLength && (maxAmbig==-1) && !maxHomoP && qFileName == ""){               
148                                 m->mothurOut("You didn't set any options... quiting command."); m->mothurOutEndLine();
149                                 abort = true;
150                         }
151                 }
152
153         }
154         catch(exception& e) {
155                 m->errorOut(e, "TrimSeqsCommand", "TrimSeqsCommand");
156                 exit(1);
157         }
158 }
159 //**********************************************************************************************************************
160
161 void TrimSeqsCommand::help(){
162         try {
163                 m->mothurOut("The trim.seqs command reads a fastaFile and creates .....\n");
164                 m->mothurOut("The trim.seqs command parameters are fasta, flip, oligos, maxambig, maxhomop, minlength, maxlength, qfile, qthreshold, qaverage, diffs, qtrim and allfiles.\n");
165                 m->mothurOut("The fasta parameter is required.\n");
166                 m->mothurOut("The flip parameter will output the reverse compliment of your trimmed sequence. The default is false.\n");
167                 m->mothurOut("The oligos parameter .... The default is "".\n");
168                 m->mothurOut("The maxambig parameter .... The default is -1.\n");
169                 m->mothurOut("The maxhomop parameter .... The default is 0.\n");
170                 m->mothurOut("The minlength parameter .... The default is 0.\n");
171                 m->mothurOut("The maxlength parameter .... The default is 0.\n");
172                 m->mothurOut("The tdiffs parameter is used to specify the total number of differences allowed in the sequence. The default is pdiffs + bdiffs.\n");
173                 m->mothurOut("The bdiffs parameter is used to specify the number of differences allowed in the barcode. The default is 0.\n");
174                 m->mothurOut("The pdiffs parameter is used to specify the number of differences allowed in the primer. The default is 0.\n");
175                 m->mothurOut("The qfile parameter .....\n");
176                 m->mothurOut("The qthreshold parameter .... The default is 0.\n");
177                 m->mothurOut("The qaverage parameter .... The default is 0.\n");
178                 m->mothurOut("The allfiles parameter .... The default is F.\n");
179                 m->mothurOut("The qtrim parameter .... The default is F.\n");
180                 m->mothurOut("The trim.seqs command should be in the following format: \n");
181                 m->mothurOut("trim.seqs(fasta=yourFastaFile, flip=yourFlip, oligos=yourOligos, maxambig=yourMaxambig,  \n");
182                 m->mothurOut("maxhomop=yourMaxhomop, minlength=youMinlength, maxlength=yourMaxlength)  \n");    
183                 m->mothurOut("Example trim.seqs(fasta=abrecovery.fasta, flip=..., oligos=..., maxambig=..., maxhomop=..., minlength=..., maxlength=...).\n");
184                 m->mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n");
185                 m->mothurOut("For more details please check out the wiki http://www.mothur.org/wiki/Trim.seqs .\n\n");
186
187         }
188         catch(exception& e) {
189                 m->errorOut(e, "TrimSeqsCommand", "help");
190                 exit(1);
191         }
192 }
193
194
195 //***************************************************************************************************************
196
197 TrimSeqsCommand::~TrimSeqsCommand(){    /*      do nothing      */      }
198
199 //***************************************************************************************************************
200
201 int TrimSeqsCommand::execute(){
202         try{
203         
204                 if (abort == true) { return 0; }
205                 
206                 numFPrimers = 0;  //this needs to be initialized
207                 numRPrimers = 0;
208                 
209                 string trimSeqFile = outputDir + getRootName(getSimpleName(fastaFile)) + "trim.fasta";
210                 outputNames.push_back(trimSeqFile);
211                 string scrapSeqFile = outputDir + getRootName(getSimpleName(fastaFile)) + "scrap.fasta";
212                 outputNames.push_back(scrapSeqFile);
213                 string groupFile = outputDir + getRootName(getSimpleName(fastaFile)) + "groups";
214                 
215                 vector<string> fastaFileNames;
216                 if(oligoFile != ""){
217                         outputNames.push_back(groupFile);
218                         getOligos(fastaFileNames);
219                 }
220                 
221                 if(qFileName != "")     {       setLines(qFileName, qLines);    }
222
223                 
224                 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
225                                 if(processors == 1){
226                                         ifstream inFASTA;
227                                         openInputFile(fastaFile, inFASTA);
228                                         int numSeqs=count(istreambuf_iterator<char>(inFASTA),istreambuf_iterator<char>(), '>');
229                                         inFASTA.close();
230                                         
231                                         lines.push_back(new linePair(0, numSeqs));
232                                         
233                                         driverCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, groupFile, fastaFileNames, lines[0], lines[0]);
234                                         
235                                         for (int j = 0; j < fastaFileNames.size(); j++) {
236                                                 rename((fastaFileNames[j] + toString(getpid()) + ".temp").c_str(), fastaFileNames[j].c_str());
237                                         }
238
239                                 }else{
240                                         setLines(fastaFile, lines);     
241                                         if(qFileName == "")     {       qLines = lines; }       
242                                                                 
243                                         createProcessesCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, groupFile, fastaFileNames); 
244                                         
245                                         rename((trimSeqFile + toString(processIDS[0]) + ".temp").c_str(), trimSeqFile.c_str());
246                                         rename((scrapSeqFile + toString(processIDS[0]) + ".temp").c_str(), scrapSeqFile.c_str());
247                                         rename((groupFile + toString(processIDS[0]) + ".temp").c_str(), groupFile.c_str());
248                                         for (int j = 0; j < fastaFileNames.size(); j++) {
249                                                 rename((fastaFileNames[j] + toString(processIDS[0]) + ".temp").c_str(), fastaFileNames[j].c_str());
250                                         }
251                                         //append files
252                                         for(int i=1;i<processors;i++){
253                                                 appendFiles((trimSeqFile + toString(processIDS[i]) + ".temp"), trimSeqFile);
254                                                 remove((trimSeqFile + toString(processIDS[i]) + ".temp").c_str());
255                                                 appendFiles((scrapSeqFile + toString(processIDS[i]) + ".temp"), scrapSeqFile);
256                                                 remove((scrapSeqFile + toString(processIDS[i]) + ".temp").c_str());
257                                                 appendFiles((groupFile + toString(processIDS[i]) + ".temp"), groupFile);
258                                                 remove((groupFile + toString(processIDS[i]) + ".temp").c_str());
259                                                 for (int j = 0; j < fastaFileNames.size(); j++) {
260                                                         appendFiles((fastaFileNames[j] + toString(processIDS[i]) + ".temp"), fastaFileNames[j]);
261                                                         remove((fastaFileNames[j] + toString(processIDS[i]) + ".temp").c_str());
262                                                 }
263                                         }
264                                 }
265                                 
266                                 if (m->control_pressed) {  return 0; }
267                 #else
268                                 ifstream inFASTA;
269                                 openInputFile(fastaFile, inFASTA);
270                                 int numSeqs=count(istreambuf_iterator<char>(inFASTA),istreambuf_iterator<char>(), '>');
271                                 inFASTA.close();
272                                 
273                                 lines.push_back(new linePair(0, numSeqs));
274                                 
275                                 driverCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, groupFile, fastaFileNames, lines[0], lines[0]);
276                                 
277                                 if (m->control_pressed) {  return 0; }
278                 #endif
279                                                 
280                                                                                 
281                 for(int i=0;i<fastaFileNames.size();i++){
282                         ifstream inFASTA;
283                         string seqName;
284                         openInputFile(getRootName(fastaFile) + groupVector[i] + ".fasta", inFASTA);
285                         ofstream outGroups;
286                         openOutputFile(outputDir + getRootName(getSimpleName(fastaFile)) + groupVector[i] + ".groups", outGroups);
287                         outputNames.push_back(outputDir + getRootName(getSimpleName(fastaFile)) + groupVector[i] + ".groups");
288                         
289                         while(!inFASTA.eof()){
290                                 if(inFASTA.get() == '>'){
291                                         inFASTA >> seqName;
292                                         outGroups << seqName << '\t' << groupVector[i] << endl;
293                                 }
294                                 while (!inFASTA.eof())  {       char c = inFASTA.get(); if (c == 10 || c == 13){        break;  }       }
295                         }
296                         outGroups.close();
297                         inFASTA.close();
298                 }
299                 
300                 if (m->control_pressed) { 
301                         for (int i = 0; i < outputNames.size(); i++) {  remove(outputNames[i].c_str()); }
302                         return 0;
303                 }
304
305                 m->mothurOutEndLine();
306                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
307                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
308                 m->mothurOutEndLine();
309                 
310                 return 0;       
311                         
312         }
313         catch(exception& e) {
314                 m->errorOut(e, "TrimSeqsCommand", "execute");
315                 exit(1);
316         }
317 }
318                 
319 /**************************************************************************************/
320 int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string trimFile, string scrapFile, string groupFile, vector<string> fastaNames, linePair* line, linePair* qline) {     
321         try {
322                 
323                 ofstream outFASTA;
324                 int able = openOutputFile(trimFile, outFASTA);
325                 
326                 ofstream scrapFASTA;
327                 openOutputFile(scrapFile, scrapFASTA);
328                 
329                 ofstream outGroups;
330                 vector<ofstream*> fastaFileNames;
331                 if (oligoFile != "") {          
332                         openOutputFile(groupFile, outGroups);   
333                         for (int i = 0; i < fastaNames.size(); i++) {
334                         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
335                                 fastaFileNames.push_back(new ofstream((fastaNames[i] + toString(getpid()) + ".temp").c_str(), ios::ate)); 
336                         #else
337                                 fastaFileNames.push_back(new ofstream((fastaNames[i] + toString(i) + ".temp").c_str(), ios::ate));                      
338                         #endif
339                         }
340                 }
341                 
342                 ifstream inFASTA;
343                 openInputFile(filename, inFASTA);
344                 
345                 ifstream qFile;
346                 if(qFileName != "")     {       openInputFile(qFileName, qFile);        }
347                 
348                 qFile.seekg(qline->start);
349                 inFASTA.seekg(line->start);
350                 
351                 for(int i=0;i<line->num;i++){
352                                 
353                         if (m->control_pressed) { 
354                                 inFASTA.close(); 
355                                 outFASTA.close();
356                                 scrapFASTA.close();
357                                 if (oligoFile != "") {   outGroups.close();   }
358                                 if(qFileName != "")     {       qFile.close();  }
359                                 for(int i=0;i<fastaFileNames.size();i++){
360                                         fastaFileNames[i]->close();
361                                         delete fastaFileNames[i];
362                                 }       
363                                 for (int i = 0; i < outputNames.size(); i++) {  remove(outputNames[i].c_str()); }
364                                 return 0;
365                         }
366                         
367                         int success = 1;
368                         
369                         Sequence currSeq(inFASTA);
370
371                         string origSeq = currSeq.getUnaligned();
372                         if (origSeq != "") {
373                                 int group;
374                                 string trashCode = "";
375                                 int currentSeqsDiffs = 0;
376                                 
377                                 if(qFileName != ""){
378                                         if(qThreshold != 0)             {       success = stripQualThreshold(currSeq, qFile);   }
379                                         else if(qAverage != 0)  {       success = cullQualAverage(currSeq, qFile);              }
380                                         
381                                         if (qtrim == 1 && (origSeq.length() != currSeq.getUnaligned().length())) { 
382                                                 success = 0; //if you don't want to trim and the sequence does not meet quality requirements, move to scrap
383                                         }
384
385                                         if(!success)                    {       trashCode += 'q';                                                               }
386                                 }
387                         
388                                 if(barcodes.size() != 0){
389                                         success = stripBarcode(currSeq, group);
390                                         if(success > bdiffs){   trashCode += 'b';       }
391                                         else{ currentSeqsDiffs += success;  }
392                                 }
393
394                                 if(numFPrimers != 0){
395                                         success = stripForward(currSeq);
396                                         if(success > pdiffs){   trashCode += 'f';       }
397                                         else{ currentSeqsDiffs += success;  }
398                                 }
399                                 
400                                 if (currentSeqsDiffs > tdiffs) { trashCode += 't';   }
401
402                                 if(numRPrimers != 0){
403                                         success = stripReverse(currSeq);
404                                         if(!success){   trashCode += 'r';       }
405                                 }
406                 
407                                 if(minLength > 0 || maxLength > 0){
408                                         success = cullLength(currSeq);
409                                         if(!success){   trashCode += 'l'; }
410                                 }
411                                 if(maxHomoP > 0){
412                                         success = cullHomoP(currSeq);
413                                         if(!success){   trashCode += 'h';       }
414                                 }
415                                 if(maxAmbig != -1){
416                                         success = cullAmbigs(currSeq);
417                                         if(!success){   trashCode += 'n';       }
418                                 }
419                                 
420                                 if(flip){       currSeq.reverseComplement();    }               // should go last                       
421                                 
422                                 if(trashCode.length() == 0){
423                                         currSeq.setAligned(currSeq.getUnaligned());
424                                         currSeq.printSequence(outFASTA);
425                                         if(barcodes.size() != 0){
426                                                 outGroups << currSeq.getName() << '\t' << groupVector[group] << endl;
427                                                 
428                                                 if(allFiles){
429                                                         currSeq.printSequence(*fastaFileNames[group]);                                  
430                                                 }
431                                         }
432                                 }
433                                 else{
434                                         currSeq.setName(currSeq.getName() + '|' + trashCode);
435                                         currSeq.setUnaligned(origSeq);
436                                         currSeq.setAligned(origSeq);
437                                         currSeq.printSequence(scrapFASTA);
438                                 }
439                         }
440                         gobble(inFASTA);
441                 }
442                 
443                 inFASTA.close();
444                 outFASTA.close();
445                 scrapFASTA.close();
446                 if (oligoFile != "") {   outGroups.close();   }
447                 if(qFileName != "")     {       qFile.close();  }
448                 
449                 for(int i=0;i<fastaFileNames.size();i++){
450                         fastaFileNames[i]->close();
451                         delete fastaFileNames[i];
452                 }               
453                 
454                 return 0;
455         }
456         catch(exception& e) {
457                 m->errorOut(e, "TrimSeqsCommand", "driverCreateTrim");
458                 exit(1);
459         }
460 }
461 /**************************************************************************************************/
462 int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName, string trimFile, string scrapFile, string groupFile, vector<string> fastaNames) {
463         try {
464 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
465                 int process = 0;
466                 int exitCommand = 1;
467                 processIDS.clear();
468                 
469                 //loop through and create all the processes you want
470                 while (process != processors) {
471                         int pid = fork();
472                         
473                         if (pid > 0) {
474                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
475                                 process++;
476                         }else if (pid == 0){
477                                 driverCreateTrim(filename, qFileName, (trimFile + toString(getpid()) + ".temp"), (scrapFile + toString(getpid()) + ".temp"), (groupFile + toString(getpid()) + ".temp"), fastaNames, lines[process], qLines[process]);
478                                 exit(0);
479                         }else { m->mothurOut("unable to spawn the necessary processes."); m->mothurOutEndLine(); exit(0); }
480                 }
481                 
482                 //force parent to wait until all the processes are done
483                 for (int i=0;i<processors;i++) { 
484                         int temp = processIDS[i];
485                         wait(&temp);
486                 }
487                 
488                 return exitCommand;
489 #endif          
490         }
491         catch(exception& e) {
492                 m->errorOut(e, "TrimSeqsCommand", "createProcessesCreateTrim");
493                 exit(1);
494         }
495 }
496 /**************************************************************************************************/
497
498 int TrimSeqsCommand::setLines(string filename, vector<linePair*>& lines) {
499         try {
500                 
501                 lines.clear();
502                 
503                 vector<long int> positions;
504                 
505                 ifstream inFASTA;
506                 openInputFile(filename, inFASTA);
507                         
508                 string input;
509                 while(!inFASTA.eof()){  
510                         input = getline(inFASTA);
511
512                         if (input.length() != 0) {
513                                 if(input[0] == '>'){ long int pos = inFASTA.tellg(); positions.push_back(pos - input.length() - 1);     }
514                         }
515                 }
516                 inFASTA.close();
517                 
518                 int numFastaSeqs = positions.size();
519         
520                 FILE * pFile;
521                 long size;
522                 
523                 //get num bytes in file
524                 pFile = fopen (filename.c_str(),"rb");
525                 if (pFile==NULL) perror ("Error opening file");
526                 else{
527                         fseek (pFile, 0, SEEK_END);
528                         size=ftell (pFile);
529                         fclose (pFile);
530                 }
531                 
532                 int numSeqsPerProcessor = numFastaSeqs / processors;
533                 
534                 for (int i = 0; i < processors; i++) {
535
536                         long int startPos = positions[ i * numSeqsPerProcessor ];
537                         if(i == processors - 1){
538                                 numSeqsPerProcessor = numFastaSeqs - i * numSeqsPerProcessor;
539                         }else{  
540                                 long int myEnd = positions[ (i+1) * numSeqsPerProcessor ];
541                         }
542                         lines.push_back(new linePair(startPos, numSeqsPerProcessor));
543                 }
544                 
545                 return numFastaSeqs;
546         }
547         catch(exception& e) {
548                 m->errorOut(e, "TrimSeqsCommand", "setLines");
549                 exit(1);
550         }
551 }
552 //***************************************************************************************************************
553
554 void TrimSeqsCommand::getOligos(vector<string>& outFASTAVec){ //vector<ofstream*>& outFASTAVec
555         try {
556                 ifstream inOligos;
557                 openInputFile(oligoFile, inOligos);
558                 
559                 ofstream test;
560                 
561                 string type, oligo, group;
562                 int index=0;
563                 
564                 while(!inOligos.eof()){
565                         inOligos >> type;
566                         
567                         if(type[0] == '#'){
568                                 while (!inOligos.eof()) {       char c = inOligos.get(); if (c == 10 || c == 13){       break;  }       } // get rest of line if there's any crap there
569                         }
570                         else{
571                                 inOligos >> oligo;
572                                 
573                                 for(int i=0;i<oligo.length();i++){
574                                         oligo[i] = toupper(oligo[i]);
575                                         if(oligo[i] == 'U')     {       oligo[i] = 'T'; }
576                                 }
577                                 
578                                 if(type == "forward"){
579                                         forPrimer.push_back(oligo);
580                                 }
581                                 else if(type == "reverse"){
582                                         Sequence oligoRC("reverse", oligo);
583                                         oligoRC.reverseComplement();
584                                         revPrimer.push_back(oligoRC.getUnaligned());
585                                 }
586                                 else if(type == "barcode"){
587                                         inOligos >> group;
588                                         barcodes[oligo]=index++;
589                                         groupVector.push_back(group);
590                                         
591                                         if(allFiles){
592                                                 //outFASTAVec.push_back(new ofstream((outputDir + getRootName(getSimpleName(fastaFile)) + group + ".fasta").c_str(), ios::ate));
593                                                 outputNames.push_back((outputDir + getRootName(getSimpleName(fastaFile)) + group + ".fasta"));
594                                                 outFASTAVec.push_back((outputDir + getRootName(getSimpleName(fastaFile)) + group + ".fasta"));
595                                         }
596                                 }
597                         }
598                 }
599                 
600                 inOligos.close();
601                 
602                 numFPrimers = forPrimer.size();
603                 numRPrimers = revPrimer.size();
604                 
605         }
606         catch(exception& e) {
607                 m->errorOut(e, "TrimSeqsCommand", "getOligos");
608                 exit(1);
609         }
610 }
611 //***************************************************************************************************************
612
613 int TrimSeqsCommand::stripBarcode(Sequence& seq, int& group){
614         try {
615                 
616                 string rawSequence = seq.getUnaligned();
617                 int success = bdiffs + 1;       //guilty until proven innocent
618                 
619                 //can you find the barcode
620                 for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
621                         string oligo = it->first;
622                         if(rawSequence.length() < oligo.length()){      //let's just assume that the barcodes are the same length
623                                 success = bdiffs + 10;                                  //if the sequence is shorter than the barcode then bail out
624                                 break;  
625                         }
626                         
627                         if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
628                                 group = it->second;
629                                 seq.setUnaligned(rawSequence.substr(oligo.length()));
630                                 success = 0;
631                                 break;
632                         }
633                 }
634                 
635                 //if you found the barcode or if you don't want to allow for diffs
636 //              cout << success;
637                 if ((bdiffs == 0) || (success == 0)) { return success;  }
638                 
639                 else { //try aligning and see if you can find it
640 //                      cout << endl;
641
642                         int maxLength = 0;
643
644                         Alignment* alignment;
645                         if (barcodes.size() > 0) {
646                                 map<string,int>::iterator it=barcodes.begin();
647
648                                 for(it;it!=barcodes.end();it++){
649                                         if(it->first.length() > maxLength){
650                                                 maxLength = it->first.length();
651                                         }
652                                 }
653                                 alignment = new NeedlemanOverlap(-2.0, 1.0, -1.0, (maxLength+bdiffs+1));  
654
655                         }else{ alignment = NULL; } 
656                         
657                         //can you find the barcode
658                         int minDiff = 1e6;
659                         int minCount = 1;
660                         int minGroup = -1;
661                         int minPos = 0;
662                         
663                         for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
664                                 string oligo = it->first;
665 //                              int length = oligo.length();
666                                 
667                                 if(rawSequence.length() < maxLength){   //let's just assume that the barcodes are the same length
668                                         success = bdiffs + 10;
669                                         break;
670                                 }
671                                 
672                                 //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
673                                 alignment->align(oligo, rawSequence.substr(0,oligo.length()+bdiffs));
674                                 oligo = alignment->getSeqAAln();
675                                 string temp = alignment->getSeqBAln();
676                 
677                                 int alnLength = oligo.length();
678                                 
679                                 for(int i=oligo.length()-1;i>=0;i--){
680                                         if(oligo[i] != '-'){    alnLength = i+1;        break;  }
681                                 }
682                                 oligo = oligo.substr(0,alnLength);
683                                 temp = temp.substr(0,alnLength);
684                                 
685                                 int newStart=0;
686                                 int numDiff = countDiffs(oligo, temp);
687                                 if(numDiff < minDiff){
688                                         minDiff = numDiff;
689                                         minCount = 1;
690                                         minGroup = it->second;
691                                         minPos = 0;
692                                         for(int i=0;i<alnLength;i++){
693                                                 if(temp[i] != '-'){
694                                                         minPos++;
695                                                 }
696                                         }
697                                 }
698                                 else if(numDiff == minDiff){
699                                         minCount++;
700                                 }
701
702                         }
703
704                         if(minDiff > bdiffs)    {       success = minDiff;              }       //no good matches
705                         else if(minCount > 1)   {       success = bdiffs + 100; }       //can't tell the difference between multiple barcodes
706                         else{                                                                                                   //use the best match
707                                 group = minGroup;
708                                 seq.setUnaligned(rawSequence.substr(minPos));
709                                 success = minDiff;
710                         }
711                         
712                         if (alignment != NULL) {  delete alignment;  }
713                         
714                 }
715 //              cout << success << endl;
716                 
717                 return success;
718                 
719         }
720         catch(exception& e) {
721                 m->errorOut(e, "TrimSeqsCommand", "stripBarcode");
722                 exit(1);
723         }
724
725 }
726
727 //***************************************************************************************************************
728
729 int TrimSeqsCommand::stripForward(Sequence& seq){
730         try {
731                 string rawSequence = seq.getUnaligned();
732                 int success = pdiffs + 1;       //guilty until proven innocent
733                 
734                 //can you find the primer
735                 for(int i=0;i<numFPrimers;i++){
736                         string oligo = forPrimer[i];
737
738                         if(rawSequence.length() < oligo.length()){      
739                                 success = pdiffs + 1;
740                                 break;
741                         }
742                         
743                         if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
744                                 seq.setUnaligned(rawSequence.substr(oligo.length()));
745                                 success = 0;
746                                 break;
747                         }
748                 }
749                 
750                 //if you found the barcode or if you don't want to allow for diffs
751 //              cout << success;
752                 if ((pdiffs == 0) || (success == 0)) { return success;  }
753                 
754                 else { //try aligning and see if you can find it
755 //                      cout << endl;
756
757                         int maxLength = 0;
758
759                         Alignment* alignment;
760                         if (numFPrimers > 0) {
761
762                                 for(int i=0;i<numFPrimers;i++){
763                                         if(forPrimer[i].length() > maxLength){
764                                                 maxLength = forPrimer[i].length();
765                                         }
766                                 }
767                                 alignment = new NeedlemanOverlap(-2.0, 1.0, -1.0, (maxLength+pdiffs+1));  
768
769                         }else{ alignment = NULL; } 
770                         
771                         //can you find the barcode
772                         int minDiff = 1e6;
773                         int minCount = 1;
774                         int minPos = 0;
775                         
776                         for(int i=0;i<numFPrimers;i++){
777                                 string oligo = forPrimer[i];
778                                 
779                                 if(rawSequence.length() < maxLength){   
780                                         success = pdiffs + 100;
781                                         break;
782                                 }
783                                 
784                                 //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
785                                 alignment->align(oligo, rawSequence.substr(0,oligo.length()+pdiffs));
786                                 oligo = alignment->getSeqAAln();
787                                 string temp = alignment->getSeqBAln();
788                 
789                                 int alnLength = oligo.length();
790                                 
791                                 for(int i=oligo.length()-1;i>=0;i--){
792                                         if(oligo[i] != '-'){    alnLength = i+1;        break;  }
793                                 }
794                                 oligo = oligo.substr(0,alnLength);
795                                 temp = temp.substr(0,alnLength);
796
797                                 int newStart=0;
798                                 int numDiff = countDiffs(oligo, temp);
799                                 if(numDiff < minDiff){
800                                         minDiff = numDiff;
801                                         minCount = 1;
802                                         minPos = 0;
803                                         for(int i=0;i<alnLength;i++){
804                                                 if(temp[i] != '-'){
805                                                         minPos++;
806                                                 }
807                                         }
808                                 }
809                                 else if(numDiff == minDiff){
810                                         minCount++;
811                                 }
812
813                         }
814                         if(minDiff > pdiffs)    {       success =  minDiff;             }
815                         else if(minCount > 1)   {       success =  pdiffs + 10; }
816                         else{
817                                 seq.setUnaligned(rawSequence.substr(minPos));
818                                 success = minDiff;
819                         }
820                         
821                         if (alignment != NULL) {  delete alignment;  }
822                         
823                 }
824                 return success;
825
826         }
827         catch(exception& e) {
828                 m->errorOut(e, "TrimSeqsCommand", "stripForward");
829                 exit(1);
830         }
831 }
832
833 //***************************************************************************************************************
834
835 bool TrimSeqsCommand::stripReverse(Sequence& seq){
836         try {
837                 string rawSequence = seq.getUnaligned();
838                 bool success = 0;       //guilty until proven innocent
839                 
840                 for(int i=0;i<numRPrimers;i++){
841                         string oligo = revPrimer[i];
842                         
843                         if(rawSequence.length() < oligo.length()){
844                                 success = 0;
845                                 break;
846                         }
847                         
848                         if(compareDNASeq(oligo, rawSequence.substr(rawSequence.length()-oligo.length(),oligo.length()))){
849                                 seq.setUnaligned(rawSequence.substr(0,rawSequence.length()-oligo.length()));
850                                 success = 1;
851                                 break;
852                         }
853                 }       
854                 return success;
855                 
856         }
857         catch(exception& e) {
858                 m->errorOut(e, "TrimSeqsCommand", "stripReverse");
859                 exit(1);
860         }
861 }
862
863 //***************************************************************************************************************
864
865 bool TrimSeqsCommand::cullLength(Sequence& seq){
866         try {
867         
868                 int length = seq.getNumBases();
869                 bool success = 0;       //guilty until proven innocent
870                 
871                 if(length >= minLength && maxLength == 0)                       {       success = 1;    }
872                 else if(length >= minLength && length <= maxLength)     {       success = 1;    }
873                 else                                                                                            {       success = 0;    }
874                 
875                 return success;
876         
877         }
878         catch(exception& e) {
879                 m->errorOut(e, "TrimSeqsCommand", "cullLength");
880                 exit(1);
881         }
882         
883 }
884
885 //***************************************************************************************************************
886
887 bool TrimSeqsCommand::cullHomoP(Sequence& seq){
888         try {
889                 int longHomoP = seq.getLongHomoPolymer();
890                 bool success = 0;       //guilty until proven innocent
891                 
892                 if(longHomoP <= maxHomoP){      success = 1;    }
893                 else                                    {       success = 0;    }
894                 
895                 return success;
896         }
897         catch(exception& e) {
898                 m->errorOut(e, "TrimSeqsCommand", "cullHomoP");
899                 exit(1);
900         }
901         
902 }
903
904 //***************************************************************************************************************
905
906 bool TrimSeqsCommand::cullAmbigs(Sequence& seq){
907         try {
908                 int numNs = seq.getAmbigBases();
909                 bool success = 0;       //guilty until proven innocent
910                 
911                 if(numNs <= maxAmbig)   {       success = 1;    }
912                 else                                    {       success = 0;    }
913                 
914                 return success;
915         }
916         catch(exception& e) {
917                 m->errorOut(e, "TrimSeqsCommand", "cullAmbigs");
918                 exit(1);
919         }
920         
921 }
922
923 //***************************************************************************************************************
924
925 bool TrimSeqsCommand::compareDNASeq(string oligo, string seq){
926         try {
927                 bool success = 1;
928                 int length = oligo.length();
929                 
930                 for(int i=0;i<length;i++){
931                         
932                         if(oligo[i] != seq[i]){
933                                 if(oligo[i] == 'A' || oligo[i] == 'T' || oligo[i] == 'G' || oligo[i] == 'C')    {       success = 0;    }
934                                 else if((oligo[i] == 'N' || oligo[i] == 'I') && (seq[i] == 'N'))                                {       success = 0;    }
935                                 else if(oligo[i] == 'R' && (seq[i] != 'A' && seq[i] != 'G'))                                    {       success = 0;    }
936                                 else if(oligo[i] == 'Y' && (seq[i] != 'C' && seq[i] != 'T'))                                    {       success = 0;    }
937                                 else if(oligo[i] == 'M' && (seq[i] != 'C' && seq[i] != 'A'))                                    {       success = 0;    }
938                                 else if(oligo[i] == 'K' && (seq[i] != 'T' && seq[i] != 'G'))                                    {       success = 0;    }
939                                 else if(oligo[i] == 'W' && (seq[i] != 'T' && seq[i] != 'A'))                                    {       success = 0;    }
940                                 else if(oligo[i] == 'S' && (seq[i] != 'C' && seq[i] != 'G'))                                    {       success = 0;    }
941                                 else if(oligo[i] == 'B' && (seq[i] != 'C' && seq[i] != 'T' && seq[i] != 'G'))   {       success = 0;    }
942                                 else if(oligo[i] == 'D' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'G'))   {       success = 0;    }
943                                 else if(oligo[i] == 'H' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'C'))   {       success = 0;    }
944                                 else if(oligo[i] == 'V' && (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G'))   {       success = 0;    }                       
945                                 
946                                 if(success == 0)        {       break;   }
947                         }
948                         else{
949                                 success = 1;
950                         }
951                 }
952                 
953                 return success;
954         }
955         catch(exception& e) {
956                 m->errorOut(e, "TrimSeqsCommand", "compareDNASeq");
957                 exit(1);
958         }
959
960 }
961 //***************************************************************************************************************
962
963 int TrimSeqsCommand::countDiffs(string oligo, string seq){
964         try {
965
966                 int length = oligo.length();
967                 int countDiffs = 0;
968                 
969                 for(int i=0;i<length;i++){
970                                                                 
971                         if(oligo[i] != seq[i]){
972                                 if(oligo[i] == 'A' || oligo[i] == 'T' || oligo[i] == 'G' || oligo[i] == 'C' || oligo[i] == '-' || oligo[i] == '.')      {       countDiffs++;   }
973                                 else if((oligo[i] == 'N' || oligo[i] == 'I') && (seq[i] == 'N'))                                {       countDiffs++;   }
974                                 else if(oligo[i] == 'R' && (seq[i] != 'A' && seq[i] != 'G'))                                    {       countDiffs++;   }
975                                 else if(oligo[i] == 'Y' && (seq[i] != 'C' && seq[i] != 'T'))                                    {       countDiffs++;   }
976                                 else if(oligo[i] == 'M' && (seq[i] != 'C' && seq[i] != 'A'))                                    {       countDiffs++;   }
977                                 else if(oligo[i] == 'K' && (seq[i] != 'T' && seq[i] != 'G'))                                    {       countDiffs++;   }
978                                 else if(oligo[i] == 'W' && (seq[i] != 'T' && seq[i] != 'A'))                                    {       countDiffs++;   }
979                                 else if(oligo[i] == 'S' && (seq[i] != 'C' && seq[i] != 'G'))                                    {       countDiffs++;   }
980                                 else if(oligo[i] == 'B' && (seq[i] != 'C' && seq[i] != 'T' && seq[i] != 'G'))   {       countDiffs++;   }
981                                 else if(oligo[i] == 'D' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'G'))   {       countDiffs++;   }
982                                 else if(oligo[i] == 'H' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'C'))   {       countDiffs++;   }
983                                 else if(oligo[i] == 'V' && (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G'))   {       countDiffs++;   }                       
984                                 
985                         }
986                 }
987                 
988                 return countDiffs;
989         }
990         catch(exception& e) {
991                 m->errorOut(e, "TrimSeqsCommand", "countDiffs");
992                 exit(1);
993         }
994
995 }
996 //***************************************************************************************************************
997
998 bool TrimSeqsCommand::stripQualThreshold(Sequence& seq, ifstream& qFile){
999         try {
1000 //              string rawSequence = seq.getUnaligned();
1001 //              int seqLength;  // = rawSequence.length();
1002 //              string name, temp, temp2;
1003 //              
1004 //              qFile >> name;
1005 //              
1006 //              //get rest of line
1007 //              temp = "";
1008 //              while (!qFile.eof())    {       
1009 //                      char c = qFile.get(); 
1010 //                      if (c == 10 || c == 13){        break;  }       
1011 //                      else { temp += c; }
1012 //              } 
1013 //      
1014 //              int pos = temp.find("length");
1015 //              if (pos == temp.npos) { m->mothurOut("Cannot find length in qfile for " + seq.getName()); m->mothurOutEndLine();  seqLength = 0;  }
1016 //              else {
1017 //                      string tempLength = temp.substr(pos);
1018 //                      istringstream iss (tempLength,istringstream::in);
1019 //                      iss >> temp;
1020 //              }
1021 //              
1022 //              splitAtEquals(temp2, temp); //separates length=242, temp=length, temp2=242
1023 //              convert(temp, seqLength); //converts string to int
1024 //      
1025 //              if (name.length() != 0) {  if(name.substr(1) != seq.getName())  {       m->mothurOut("sequence name mismatch btwn fasta and qual file"); m->mothurOutEndLine(); }  } 
1026                 
1027                 string rawSequence = seq.getUnaligned();
1028                 int seqLength = seq.getNumBases();
1029                 bool success = 0;       //guilty until proven innocent
1030                 string name;
1031                 
1032                 qFile >> name;
1033                 if (name[0] == '>') {  if(name.substr(1) != seq.getName())      {       m->mothurOut("sequence name mismatch btwn fasta: " + seq.getName() + " and qual file: " + name); m->mothurOutEndLine(); }  }
1034                 
1035                 while (!qFile.eof())    {       char c = qFile.get(); if (c == 10 || c == 13){  break;  }       }
1036                 
1037                 int score;
1038                 int end = seqLength;
1039                 
1040                 for(int i=0;i<seqLength;i++){
1041                         qFile >> score;
1042                         
1043                         if(score < qThreshold){
1044                                 end = i;
1045                                 break;
1046                         }
1047                 }
1048                 for(int i=end+1;i<seqLength;i++){
1049                         qFile >> score;
1050                 }
1051                 
1052                 seq.setUnaligned(rawSequence.substr(0,end));
1053                 
1054                 return 1;
1055         }
1056         catch(exception& e) {
1057                 m->errorOut(e, "TrimSeqsCommand", "stripQualThreshold");
1058                 exit(1);
1059         }
1060 }
1061
1062 //***************************************************************************************************************
1063
1064 bool TrimSeqsCommand::cullQualAverage(Sequence& seq, ifstream& qFile){
1065         try {
1066                 string rawSequence = seq.getUnaligned();
1067                 int seqLength = seq.getNumBases();
1068                 bool success = 0;       //guilty until proven innocent
1069                 string name;
1070                 
1071                 qFile >> name;
1072                 if (name[0] == '>') {  if(name.substr(1) != seq.getName())      {       m->mothurOut("sequence name mismatch btwn fasta: " + seq.getName() + " and qual file: " + name); m->mothurOutEndLine(); }  }
1073                 
1074                 while (!qFile.eof())    {       char c = qFile.get(); if (c == 10 || c == 13){  break;  }       }
1075                 
1076                 float score;    
1077                 float average = 0;
1078                 
1079                 for(int i=0;i<seqLength;i++){
1080                         qFile >> score;
1081                         average += score;
1082                 }
1083                 average /= seqLength;
1084
1085                 if(average >= qAverage) {       success = 1;    }
1086                 else                                    {       success = 0;    }
1087                 
1088                 return success;
1089         }
1090         catch(exception& e) {
1091                 m->errorOut(e, "TrimSeqsCommand", "cullQualAverage");
1092                 exit(1);
1093         }
1094 }
1095
1096 //***************************************************************************************************************