]> git.donarmstrong.com Git - mothur.git/blob - trimseqscommand.cpp
1.10.0
[mothur.git] / trimseqscommand.cpp
1 /*
2  *  trimseqscommand.cpp
3  *  Mothur
4  *
5  *  Created by Pat Schloss on 6/6/09.
6  *  Copyright 2009 Patrick D. Schloss. All rights reserved.
7  *
8  */
9
10 #include "trimseqscommand.h"
11 #include "needlemanoverlap.hpp"
12
13 //***************************************************************************************************************
14
15 TrimSeqsCommand::TrimSeqsCommand(string option)  {
16         try {
17                 
18                 abort = false;
19                 
20                 //allow user to run help
21                 if(option == "help") { help(); abort = true; }
22                 
23                 else {
24                         //valid paramters for this command
25                         string AlignArray[] =  {"fasta", "flip", "oligos", "maxambig", "maxhomop", "minlength", "maxlength", "qfile", 
26                                                                         "qthreshold", "qaverage", "allfiles", "qtrim","tdiffs", "pdiffs", "bdiffs", "processors", "outputdir","inputdir"};
27                         
28                         vector<string> myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string)));
29                         
30                         OptionParser parser(option);
31                         map<string,string> parameters = parser.getParameters();
32                         
33                         ValidParameters validParameter;
34                         map<string,string>::iterator it;
35                         
36                         //check to make sure all parameters are valid for command
37                         for (it = parameters.begin(); it != parameters.end(); it++) { 
38                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
39                         }
40                         
41                         //if the user changes the input directory command factory will send this info to us in the output parameter 
42                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
43                         if (inputDir == "not found"){   inputDir = "";          }
44                         else {
45                                 string path;
46                                 it = parameters.find("fasta");
47                                 //user has given a template file
48                                 if(it != parameters.end()){ 
49                                         path = hasPath(it->second);
50                                         //if the user has not given a path then, add inputdir. else leave path alone.
51                                         if (path == "") {       parameters["fasta"] = inputDir + it->second;            }
52                                 }
53                                 
54                                 it = parameters.find("oligos");
55                                 //user has given a template file
56                                 if(it != parameters.end()){ 
57                                         path = hasPath(it->second);
58                                         //if the user has not given a path then, add inputdir. else leave path alone.
59                                         if (path == "") {       parameters["oligos"] = inputDir + it->second;           }
60                                 }
61                                 
62                                 it = parameters.find("qfile");
63                                 //user has given a template file
64                                 if(it != parameters.end()){ 
65                                         path = hasPath(it->second);
66                                         //if the user has not given a path then, add inputdir. else leave path alone.
67                                         if (path == "") {       parameters["qfile"] = inputDir + it->second;            }
68                                 }
69                         }
70
71                         
72                         //check for required parameters
73                         fastaFile = validParameter.validFile(parameters, "fasta", true);
74                         if (fastaFile == "not found") { m->mothurOut("fasta is a required parameter for the screen.seqs command."); m->mothurOutEndLine(); abort = true; }
75                         else if (fastaFile == "not open") { abort = true; }     
76                         
77                         //if the user changes the output directory command factory will send this info to us in the output parameter 
78                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  
79                                 outputDir = ""; 
80                                 outputDir += hasPath(fastaFile); //if user entered a file with a path then preserve it  
81                         }
82                 
83                         //check for optional parameter and set defaults
84                         // ...at some point should added some additional type checking...
85                         string temp;
86                         temp = validParameter.validFile(parameters, "flip", false);
87                         if (temp == "not found"){       flip = 0;       }
88                         else if(isTrue(temp))   {       flip = 1;       }
89                 
90                         temp = validParameter.validFile(parameters, "oligos", true);
91                         if (temp == "not found"){       oligoFile = "";         }
92                         else if(temp == "not open"){    abort = true;   } 
93                         else                                    {       oligoFile = temp;               }
94                         
95                         temp = validParameter.validFile(parameters, "maxambig", false);         if (temp == "not found") { temp = "-1"; }
96                         convert(temp, maxAmbig);  
97
98                         temp = validParameter.validFile(parameters, "maxhomop", false);         if (temp == "not found") { temp = "0"; }
99                         convert(temp, maxHomoP);  
100
101                         temp = validParameter.validFile(parameters, "minlength", false);        if (temp == "not found") { temp = "0"; }
102                         convert(temp, minLength); 
103                         
104                         temp = validParameter.validFile(parameters, "maxlength", false);        if (temp == "not found") { temp = "0"; }
105                         convert(temp, maxLength);
106                         
107                         
108                         temp = validParameter.validFile(parameters, "bdiffs", false);           if (temp == "not found") { temp = "0"; }
109                         convert(temp, bdiffs);
110                         
111                         temp = validParameter.validFile(parameters, "pdiffs", false);           if (temp == "not found") { temp = "0"; }
112                         convert(temp, pdiffs);
113                         
114                         temp = validParameter.validFile(parameters, "tdiffs", false);           if (temp == "not found") { int tempTotal = pdiffs + bdiffs;  temp = toString(tempTotal); }
115                         convert(temp, tdiffs);
116                         
117                         if(tdiffs == 0){        tdiffs = bdiffs + pdiffs;       }
118                         
119                         temp = validParameter.validFile(parameters, "qfile", true);     
120                         if (temp == "not found")        {       qFileName = "";         }
121                         else if(temp == "not open")     {       abort = true;           }
122                         else                                            {       qFileName = temp;       }
123                         
124                         temp = validParameter.validFile(parameters, "qthreshold", false);       if (temp == "not found") { temp = "0"; }
125                         convert(temp, qThreshold);
126                         
127                         temp = validParameter.validFile(parameters, "qtrim", false);    if (temp == "not found") { temp = "F"; }
128                         qtrim = isTrue(temp);
129
130                         temp = validParameter.validFile(parameters, "qaverage", false);         if (temp == "not found") { temp = "0"; }
131                         convert(temp, qAverage);
132                         
133                         temp = validParameter.validFile(parameters, "allfiles", false);         if (temp == "not found") { temp = "F"; }
134                         allFiles = isTrue(temp);
135                         
136                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found"){       temp = "1";                             }
137                         convert(temp, processors); 
138                         
139                         if(allFiles && oligoFile == ""){
140                                 m->mothurOut("You selected allfiles, but didn't enter an oligos file.  Ignoring the allfiles request."); m->mothurOutEndLine();
141                         }
142                         if((qAverage != 0 && qThreshold != 0) && qFileName == ""){
143                                 m->mothurOut("You didn't provide a quality file name, quality criteria will be ignored."); m->mothurOutEndLine();
144                                 qAverage=0;
145                                 qThreshold=0;
146                         }
147                         if(!flip && oligoFile=="" && !maxLength && !minLength && (maxAmbig==-1) && !maxHomoP && qFileName == ""){               
148                                 m->mothurOut("You didn't set any options... quiting command."); m->mothurOutEndLine();
149                                 abort = true;
150                         }
151                 }
152
153         }
154         catch(exception& e) {
155                 m->errorOut(e, "TrimSeqsCommand", "TrimSeqsCommand");
156                 exit(1);
157         }
158 }
159 //**********************************************************************************************************************
160
161 void TrimSeqsCommand::help(){
162         try {
163                 m->mothurOut("The trim.seqs command reads a fastaFile and creates .....\n");
164                 m->mothurOut("The trim.seqs command parameters are fasta, flip, oligos, maxambig, maxhomop, minlength, maxlength, qfile, qthreshold, qaverage, diffs, qtrim and allfiles.\n");
165                 m->mothurOut("The fasta parameter is required.\n");
166                 m->mothurOut("The flip parameter .... The default is 0.\n");
167                 m->mothurOut("The oligos parameter .... The default is "".\n");
168                 m->mothurOut("The maxambig parameter .... The default is -1.\n");
169                 m->mothurOut("The maxhomop parameter .... The default is 0.\n");
170                 m->mothurOut("The minlength parameter .... The default is 0.\n");
171                 m->mothurOut("The maxlength parameter .... The default is 0.\n");
172                 m->mothurOut("The tdiffs parameter is used to specify the total number of differences allowed in the sequence. The default is pdiffs + bdiffs.\n");
173                 m->mothurOut("The bdiffs parameter is used to specify the number of differences allowed in the barcode. The default is 0.\n");
174                 m->mothurOut("The pdiffs parameter is used to specify the number of differences allowed in the primer. The default is 0.\n");
175                 m->mothurOut("The qfile parameter .....\n");
176                 m->mothurOut("The qthreshold parameter .... The default is 0.\n");
177                 m->mothurOut("The qaverage parameter .... The default is 0.\n");
178                 m->mothurOut("The allfiles parameter .... The default is F.\n");
179                 m->mothurOut("The qtrim parameter .... The default is F.\n");
180                 m->mothurOut("The trim.seqs command should be in the following format: \n");
181                 m->mothurOut("trim.seqs(fasta=yourFastaFile, flip=yourFlip, oligos=yourOligos, maxambig=yourMaxambig,  \n");
182                 m->mothurOut("maxhomop=yourMaxhomop, minlength=youMinlength, maxlength=yourMaxlength)  \n");    
183                 m->mothurOut("Example trim.seqs(fasta=abrecovery.fasta, flip=..., oligos=..., maxambig=..., maxhomop=..., minlength=..., maxlength=...).\n");
184                 m->mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n");
185                 m->mothurOut("For more details please check out the wiki http://www.mothur.org/wiki/Trim.seqs .\n\n");
186
187         }
188         catch(exception& e) {
189                 m->errorOut(e, "TrimSeqsCommand", "help");
190                 exit(1);
191         }
192 }
193
194
195 //***************************************************************************************************************
196
197 TrimSeqsCommand::~TrimSeqsCommand(){    /*      do nothing      */      }
198
199 //***************************************************************************************************************
200
201 int TrimSeqsCommand::execute(){
202         try{
203         
204                 if (abort == true) { return 0; }
205                 
206                 numFPrimers = 0;  //this needs to be initialized
207                 numRPrimers = 0;
208                 
209                 string trimSeqFile = outputDir + getRootName(getSimpleName(fastaFile)) + "trim.fasta";
210                 outputNames.push_back(trimSeqFile);
211                 string scrapSeqFile = outputDir + getRootName(getSimpleName(fastaFile)) + "scrap.fasta";
212                 outputNames.push_back(scrapSeqFile);
213                 string groupFile = outputDir + getRootName(getSimpleName(fastaFile)) + "groups";
214                 
215                 vector<string> fastaFileNames;
216                 if(oligoFile != ""){
217                         outputNames.push_back(groupFile);
218                         getOligos(fastaFileNames);
219                 }
220                 
221                 if(qFileName != "")     {       setLines(qFileName, qLines);    }
222
223                 
224                 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
225                                 if(processors == 1){
226                                         ifstream inFASTA;
227                                         openInputFile(fastaFile, inFASTA);
228                                         int numSeqs=count(istreambuf_iterator<char>(inFASTA),istreambuf_iterator<char>(), '>');
229                                         inFASTA.close();
230                                         
231                                         lines.push_back(new linePair(0, numSeqs));
232                                         
233                                         driverCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, groupFile, fastaFileNames, lines[0], lines[0]);
234                                         
235                                         for (int j = 0; j < fastaFileNames.size(); j++) {
236                                                 rename((fastaFileNames[j] + toString(getpid()) + ".temp").c_str(), fastaFileNames[j].c_str());
237                                         }
238
239                                 }else{
240                                         setLines(fastaFile, lines);     
241                                         if(qFileName == "")     {       qLines = lines; }       
242                                                                 
243                                         createProcessesCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, groupFile, fastaFileNames); 
244                                         
245                                         rename((trimSeqFile + toString(processIDS[0]) + ".temp").c_str(), trimSeqFile.c_str());
246                                         rename((scrapSeqFile + toString(processIDS[0]) + ".temp").c_str(), scrapSeqFile.c_str());
247                                         rename((groupFile + toString(processIDS[0]) + ".temp").c_str(), groupFile.c_str());
248                                         for (int j = 0; j < fastaFileNames.size(); j++) {
249                                                 rename((fastaFileNames[j] + toString(processIDS[0]) + ".temp").c_str(), fastaFileNames[j].c_str());
250                                         }
251                                         //append files
252                                         for(int i=1;i<processors;i++){
253                                                 appendFiles((trimSeqFile + toString(processIDS[i]) + ".temp"), trimSeqFile);
254                                                 remove((trimSeqFile + toString(processIDS[i]) + ".temp").c_str());
255                                                 appendFiles((scrapSeqFile + toString(processIDS[i]) + ".temp"), scrapSeqFile);
256                                                 remove((scrapSeqFile + toString(processIDS[i]) + ".temp").c_str());
257                                                 appendFiles((groupFile + toString(processIDS[i]) + ".temp"), groupFile);
258                                                 remove((groupFile + toString(processIDS[i]) + ".temp").c_str());
259                                                 for (int j = 0; j < fastaFileNames.size(); j++) {
260                                                         appendFiles((fastaFileNames[j] + toString(processIDS[i]) + ".temp"), fastaFileNames[j]);
261                                                         remove((fastaFileNames[j] + toString(processIDS[i]) + ".temp").c_str());
262                                                 }
263                                         }
264                                 }
265                                 
266                                 if (m->control_pressed) {  return 0; }
267                 #else
268                                 ifstream inFASTA;
269                                 openInputFile(fastaFile, inFASTA);
270                                 int numSeqs=count(istreambuf_iterator<char>(inFASTA),istreambuf_iterator<char>(), '>');
271                                 inFASTA.close();
272                                 
273                                 lines.push_back(new linePair(0, numSeqs));
274                                 
275                                 driverCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, groupFile, fastaFileNames, lines[0], lines[0]);
276                                 
277                                 if (m->control_pressed) {  return 0; }
278                 #endif
279                                                 
280                                                                                 
281                 for(int i=0;i<fastaFileNames.size();i++){
282                         ifstream inFASTA;
283                         string seqName;
284                         openInputFile(getRootName(fastaFile) + groupVector[i] + ".fasta", inFASTA);
285                         ofstream outGroups;
286                         openOutputFile(outputDir + getRootName(getSimpleName(fastaFile)) + groupVector[i] + ".groups", outGroups);
287                         outputNames.push_back(outputDir + getRootName(getSimpleName(fastaFile)) + groupVector[i] + ".groups");
288                         
289                         while(!inFASTA.eof()){
290                                 if(inFASTA.get() == '>'){
291                                         inFASTA >> seqName;
292                                         outGroups << seqName << '\t' << groupVector[i] << endl;
293                                 }
294                                 while (!inFASTA.eof())  {       char c = inFASTA.get(); if (c == 10 || c == 13){        break;  }       }
295                         }
296                         outGroups.close();
297                         inFASTA.close();
298                 }
299                 
300                 if (m->control_pressed) { 
301                         for (int i = 0; i < outputNames.size(); i++) {  remove(outputNames[i].c_str()); }
302                         return 0;
303                 }
304
305                 m->mothurOutEndLine();
306                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
307                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
308                 m->mothurOutEndLine();
309                 
310                 return 0;       
311                         
312         }
313         catch(exception& e) {
314                 m->errorOut(e, "TrimSeqsCommand", "execute");
315                 exit(1);
316         }
317 }
318                 
319 /**************************************************************************************/
320 int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string trimFile, string scrapFile, string groupFile, vector<string> fastaNames, linePair* line, linePair* qline) {     
321         try {
322                 
323                 ofstream outFASTA;
324                 int able = openOutputFile(trimFile, outFASTA);
325                 
326                 ofstream scrapFASTA;
327                 openOutputFile(scrapFile, scrapFASTA);
328                 
329                 ofstream outGroups;
330                 vector<ofstream*> fastaFileNames;
331                 if (oligoFile != "") {          
332                         openOutputFile(groupFile, outGroups);   
333                         for (int i = 0; i < fastaNames.size(); i++) {
334                         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
335                                 fastaFileNames.push_back(new ofstream((fastaNames[i] + toString(getpid()) + ".temp").c_str(), ios::ate)); 
336                         #else
337                                 fastaFileNames.push_back(new ofstream((fastaNames[i] + toString(i) + ".temp").c_str(), ios::ate));                      
338                         #endif
339                         }
340                 }
341                 
342                 ifstream inFASTA;
343                 openInputFile(filename, inFASTA);
344                 
345                 ifstream qFile;
346                 if(qFileName != "")     {       openInputFile(qFileName, qFile);        }
347                 
348                 qFile.seekg(qline->start);
349                 inFASTA.seekg(line->start);
350                 
351                 for(int i=0;i<line->num;i++){
352                                 
353                         if (m->control_pressed) { 
354                                 inFASTA.close(); 
355                                 outFASTA.close();
356                                 scrapFASTA.close();
357                                 if (oligoFile != "") {   outGroups.close();   }
358                                 if(qFileName != "")     {       qFile.close();  }
359                                 for(int i=0;i<fastaFileNames.size();i++){
360                                         fastaFileNames[i]->close();
361                                         delete fastaFileNames[i];
362                                 }       
363                                 for (int i = 0; i < outputNames.size(); i++) {  remove(outputNames[i].c_str()); }
364                                 return 0;
365                         }
366                         
367                         bool success = 1;
368                         
369                         Sequence currSeq(inFASTA);
370
371                         string origSeq = currSeq.getUnaligned();
372                         if (origSeq != "") {
373                                 int group;
374                                 string trashCode = "";
375                                 int currentSeqsDiffs = 0;
376                                 
377                                 if(qFileName != ""){
378                                         if(qThreshold != 0)             {       success = stripQualThreshold(currSeq, qFile);   }
379                                         else if(qAverage != 0)  {       success = cullQualAverage(currSeq, qFile);              }
380                                         if ((!qtrim) && (origSeq.length() != currSeq.getUnaligned().length())) { 
381                                                 success = 0; //if you don't want to trim and the sequence does not meet quality requirements, move to scrap
382                                         }
383                                         if(!success)                    {       trashCode += 'q';                                                               }
384                                 }
385                         
386                                 if(barcodes.size() != 0){
387                                         success = stripBarcode(currSeq, group);
388 //                                      cout << "here: " << success << endl;
389                                         if(success > bdiffs){   trashCode += 'b';       }
390                                         else{ currentSeqsDiffs += success;  }
391                                 }
392
393                                 if(numFPrimers != 0){
394                                         success = stripForward(currSeq);
395                                         if(success > pdiffs){   trashCode += 'f';       }
396                                         else{ currentSeqsDiffs += success;  }
397                                 }
398                                 
399                                 if (currentSeqsDiffs > tdiffs) { trashCode += 't';   }
400
401                                 if(numRPrimers != 0){
402                                         success = stripReverse(currSeq);
403                                         if(!success){   trashCode += 'r';       }
404                                 }
405                 
406                                 if(minLength > 0 || maxLength > 0){
407                                         success = cullLength(currSeq);
408                                         if(!success){   trashCode += 'l'; }
409                                 }
410                                 if(maxHomoP > 0){
411                                         success = cullHomoP(currSeq);
412                                         if(!success){   trashCode += 'h';       }
413                                 }
414                                 if(maxAmbig != -1){
415                                         success = cullAmbigs(currSeq);
416                                         if(!success){   trashCode += 'n';       }
417                                 }
418                                 
419                                 if(flip){       currSeq.reverseComplement();    }               // should go last                       
420                                 
421                                 if(trashCode.length() == 0){
422                                         currSeq.setAligned(currSeq.getUnaligned());  //this is because of a modification we made to the sequence class to fix a bug.  all seqs have an aligned version, which is the version that gets printed.
423                                         currSeq.printSequence(outFASTA);
424                                         if(barcodes.size() != 0){
425                                                 outGroups << currSeq.getName() << '\t' << groupVector[group] << endl;
426                                                 
427                                                 if(allFiles){
428                                                         currSeq.printSequence(*fastaFileNames[group]);                                  
429                                                 }
430                                         }
431                                 }
432                                 else{
433                                         currSeq.setName(currSeq.getName() + '|' + trashCode);
434                                         currSeq.setUnaligned(origSeq);
435                                         currSeq.printSequence(scrapFASTA);
436                                 }
437                         }
438                         gobble(inFASTA);
439                 }
440                 
441                 inFASTA.close();
442                 outFASTA.close();
443                 scrapFASTA.close();
444                 if (oligoFile != "") {   outGroups.close();   }
445                 if(qFileName != "")     {       qFile.close();  }
446                 
447                 for(int i=0;i<fastaFileNames.size();i++){
448                         fastaFileNames[i]->close();
449                         delete fastaFileNames[i];
450                 }               
451                 
452                 return 0;
453         }
454         catch(exception& e) {
455                 m->errorOut(e, "TrimSeqsCommand", "driverCreateTrim");
456                 exit(1);
457         }
458 }
459 /**************************************************************************************************/
460 int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName, string trimFile, string scrapFile, string groupFile, vector<string> fastaNames) {
461         try {
462 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
463                 int process = 0;
464                 int exitCommand = 1;
465                 processIDS.clear();
466                 
467                 //loop through and create all the processes you want
468                 while (process != processors) {
469                         int pid = fork();
470                         
471                         if (pid > 0) {
472                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
473                                 process++;
474                         }else if (pid == 0){
475                                 driverCreateTrim(filename, qFileName, (trimFile + toString(getpid()) + ".temp"), (scrapFile + toString(getpid()) + ".temp"), (groupFile + toString(getpid()) + ".temp"), fastaNames, lines[process], qLines[process]);
476                                 exit(0);
477                         }else { m->mothurOut("unable to spawn the necessary processes."); m->mothurOutEndLine(); exit(0); }
478                 }
479                 
480                 //force parent to wait until all the processes are done
481                 for (int i=0;i<processors;i++) { 
482                         int temp = processIDS[i];
483                         wait(&temp);
484                 }
485                 
486                 return exitCommand;
487 #endif          
488         }
489         catch(exception& e) {
490                 m->errorOut(e, "TrimSeqsCommand", "createProcessesCreateTrim");
491                 exit(1);
492         }
493 }
494 /**************************************************************************************************/
495
496 int TrimSeqsCommand::setLines(string filename, vector<linePair*>& lines) {
497         try {
498                 
499                 lines.clear();
500                 
501                 vector<long int> positions;
502                 
503                 ifstream inFASTA;
504                 openInputFile(filename, inFASTA);
505                         
506                 string input;
507                 while(!inFASTA.eof()){  
508                         input = getline(inFASTA);
509
510                         if (input.length() != 0) {
511                                 if(input[0] == '>'){ long int pos = inFASTA.tellg(); positions.push_back(pos - input.length() - 1);     }
512                         }
513                 }
514                 inFASTA.close();
515                 
516                 int numFastaSeqs = positions.size();
517         
518                 FILE * pFile;
519                 long size;
520                 
521                 //get num bytes in file
522                 pFile = fopen (filename.c_str(),"rb");
523                 if (pFile==NULL) perror ("Error opening file");
524                 else{
525                         fseek (pFile, 0, SEEK_END);
526                         size=ftell (pFile);
527                         fclose (pFile);
528                 }
529                 
530                 int numSeqsPerProcessor = numFastaSeqs / processors;
531                 
532                 for (int i = 0; i < processors; i++) {
533
534                         long int startPos = positions[ i * numSeqsPerProcessor ];
535                         if(i == processors - 1){
536                                 numSeqsPerProcessor = numFastaSeqs - i * numSeqsPerProcessor;
537                         }else{  
538                                 long int myEnd = positions[ (i+1) * numSeqsPerProcessor ];
539                         }
540                         lines.push_back(new linePair(startPos, numSeqsPerProcessor));
541                 }
542                 
543                 return numFastaSeqs;
544         }
545         catch(exception& e) {
546                 m->errorOut(e, "TrimSeqsCommand", "setLines");
547                 exit(1);
548         }
549 }
550 //***************************************************************************************************************
551
552 void TrimSeqsCommand::getOligos(vector<string>& outFASTAVec){ //vector<ofstream*>& outFASTAVec
553         try {
554                 ifstream inOligos;
555                 openInputFile(oligoFile, inOligos);
556                 
557                 ofstream test;
558                 
559                 string type, oligo, group;
560                 int index=0;
561                 
562                 while(!inOligos.eof()){
563                         inOligos >> type;
564                         
565                         if(type[0] == '#'){
566                                 while (!inOligos.eof()) {       char c = inOligos.get(); if (c == 10 || c == 13){       break;  }       } // get rest of line if there's any crap there
567                         }
568                         else{
569                                 inOligos >> oligo;
570                                 
571                                 for(int i=0;i<oligo.length();i++){
572                                         oligo[i] = toupper(oligo[i]);
573                                         if(oligo[i] == 'U')     {       oligo[i] = 'T'; }
574                                 }
575                                 
576                                 if(type == "forward"){
577                                         forPrimer.push_back(oligo);
578                                 }
579                                 else if(type == "reverse"){
580                                         Sequence oligoRC("reverse", oligo);
581                                         oligoRC.reverseComplement();
582                                         revPrimer.push_back(oligoRC.getUnaligned());
583                                 }
584                                 else if(type == "barcode"){
585                                         inOligos >> group;
586                                         barcodes[oligo]=index++;
587                                         groupVector.push_back(group);
588                                         
589                                         if(allFiles){
590                                                 //outFASTAVec.push_back(new ofstream((outputDir + getRootName(getSimpleName(fastaFile)) + group + ".fasta").c_str(), ios::ate));
591                                                 outputNames.push_back((outputDir + getRootName(getSimpleName(fastaFile)) + group + ".fasta"));
592                                                 outFASTAVec.push_back((outputDir + getRootName(getSimpleName(fastaFile)) + group + ".fasta"));
593                                         }
594                                 }
595                         }
596                 }
597                 
598                 inOligos.close();
599                 
600                 numFPrimers = forPrimer.size();
601                 numRPrimers = revPrimer.size();
602                 
603         }
604         catch(exception& e) {
605                 m->errorOut(e, "TrimSeqsCommand", "getOligos");
606                 exit(1);
607         }
608 }
609 //***************************************************************************************************************
610
611 int TrimSeqsCommand::stripBarcode(Sequence& seq, int& group){
612         try {
613                 string rawSequence = seq.getUnaligned();
614                 bool success = bdiffs + 1;      //guilty until proven innocent
615                 
616                 //can you find the barcode
617                 for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
618                         string oligo = it->first;
619                         if(rawSequence.length() < oligo.length()){      //let's just assume that the barcodes are the same length
620                                 success = bdiffs + 1;
621                                 break;
622                         }
623                         
624                         if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
625                                 group = it->second;
626                                 seq.setUnaligned(rawSequence.substr(oligo.length()));
627                                 success = 0;
628                                 break;
629                         }
630                 }
631                 
632                 //if you found the barcode or if you don't want to allow for diffs
633 //              cout << success;
634                 if ((bdiffs == 0) || (success == 0)) { return success;  }
635                 
636                 else { //try aligning and see if you can find it
637 //                      cout << endl;
638
639                         int maxLength = 0;
640
641                         Alignment* alignment;
642                         if (barcodes.size() > 0) {
643                                 map<string,int>::iterator it=barcodes.begin();
644
645                                 for(it;it!=barcodes.end();it++){
646                                         if(it->first.length() > maxLength){
647                                                 maxLength = it->first.length();
648                                         }
649                                 }
650                                 alignment = new NeedlemanOverlap(-2.0, 1.0, -1.0, (maxLength+bdiffs+1));  
651
652                         }else{ alignment = NULL; } 
653                         
654                         //can you find the barcode
655                         int minDiff = 1e6;
656                         int minCount = 1;
657                         int minGroup = -1;
658                         int minPos = 0;
659                         
660                         for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
661                                 string oligo = it->first;
662 //                              int length = oligo.length();
663                                 
664                                 if(rawSequence.length() < maxLength){   //let's just assume that the barcodes are the same length
665                                         success = bdiffs + 1;
666                                         break;
667                                 }
668                                 
669                                 //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
670                                 alignment->align(oligo, rawSequence.substr(0,oligo.length()+bdiffs));
671                                 oligo = alignment->getSeqAAln();
672                                 string temp = alignment->getSeqBAln();
673                 
674                                 int alnLength = oligo.length();
675                                 
676                                 for(int i=oligo.length()-1;i>=0;i--){
677                                         if(oligo[i] != '-'){    alnLength = i+1;        break;  }
678                                 }
679                                 oligo = oligo.substr(0,alnLength);
680                                 temp = temp.substr(0,alnLength);
681 //                              
682                                 
683                                 int newStart=0;
684                                 int numDiff = countDiffs(oligo, temp);
685                                 if(numDiff < minDiff){
686                                         minDiff = numDiff;
687                                         minCount = 1;
688                                         minGroup = it->second;
689                                         minPos = 0;
690                                         for(int i=0;i<alnLength;i++){
691                                                 if(temp[i] != '-'){
692                                                         minPos++;
693                                                 }
694                                         }
695                                 }
696                                 else if(numDiff == minDiff){
697                                         minCount++;
698                                 }
699
700                         }
701                         if(minDiff > bdiffs){   success =  bdiffs + 1;  }
702                         else if(minCount > 1)   {       success =  bdiffs + 1;  }
703                         else{
704                                 group = minGroup;
705                                 seq.setUnaligned("*" + rawSequence.substr(minPos));
706                                 success = minDiff;
707                         }
708                         
709                         if (alignment != NULL) {  delete alignment;  }
710                         
711                 }
712                 return success;
713                 
714         }
715         catch(exception& e) {
716                 m->errorOut(e, "TrimSeqsCommand", "stripBarcode");
717                 exit(1);
718         }
719
720 }
721
722 //***************************************************************************************************************
723
724 int TrimSeqsCommand::stripForward(Sequence& seq){
725         try {
726                 string rawSequence = seq.getUnaligned();
727                 bool success = pdiffs + 1;      //guilty until proven innocent
728                 
729                 //can you find the primer
730                 for(int i=0;i<numFPrimers;i++){
731                         string oligo = forPrimer[i];
732
733                         if(rawSequence.length() < oligo.length()){      
734                                 success = pdiffs + 1;
735                                 break;
736                         }
737                         
738                         if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
739                                 seq.setUnaligned(rawSequence.substr(oligo.length()));
740                                 success = 0;
741                                 break;
742                         }
743                 }
744                 
745                 //if you found the barcode or if you don't want to allow for diffs
746 //              cout << success;
747                 if ((pdiffs == 0) || (success == 0)) { return success;  }
748                 
749                 else { //try aligning and see if you can find it
750 //                      cout << endl;
751
752                         int maxLength = 0;
753
754                         Alignment* alignment;
755                         if (numFPrimers > 0) {
756
757                                 for(int i=0;i<numFPrimers;i++){
758                                         if(forPrimer[i].length() > maxLength){
759                                                 maxLength = forPrimer[i].length();
760                                         }
761                                 }
762                                 alignment = new NeedlemanOverlap(-2.0, 1.0, -1.0, (maxLength+pdiffs+1));  
763
764                         }else{ alignment = NULL; } 
765                         
766                         //can you find the barcode
767                         int minDiff = 1e6;
768                         int minCount = 1;
769                         int minPos = 0;
770                         
771                         for(int i=0;i<numFPrimers;i++){
772                                 string oligo = forPrimer[i];
773                                 
774                                 if(rawSequence.length() < maxLength){   
775                                         success = pdiffs + 1;
776                                         break;
777                                 }
778                                 
779                                 //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
780                                 alignment->align(oligo, rawSequence.substr(0,oligo.length()+pdiffs));
781                                 oligo = alignment->getSeqAAln();
782                                 string temp = alignment->getSeqBAln();
783                 
784                                 int alnLength = oligo.length();
785                                 
786                                 for(int i=oligo.length()-1;i>=0;i--){
787                                         if(oligo[i] != '-'){    alnLength = i+1;        break;  }
788                                 }
789                                 oligo = oligo.substr(0,alnLength);
790                                 temp = temp.substr(0,alnLength);
791 //                                                              
792                                 int newStart=0;
793                                 int numDiff = countDiffs(oligo, temp);
794                                 if(numDiff < minDiff){
795                                         minDiff = numDiff;
796                                         minCount = 1;
797                                         minPos = 0;
798                                         for(int i=0;i<alnLength;i++){
799                                                 if(temp[i] != '-'){
800                                                         minPos++;
801                                                 }
802                                         }
803                                 }
804                                 else if(numDiff == minDiff){
805                                         minCount++;
806                                 }
807
808                         }
809                         if(minDiff > pdiffs){   success =  pdiffs + 1;  }
810                         else if(minCount > 1)   {       success =  pdiffs + 1;  }
811                         else{
812                                 seq.setUnaligned("*" + rawSequence.substr(minPos));
813                                 success = minDiff;
814                         }
815                         
816                         if (alignment != NULL) {  delete alignment;  }
817                         
818                 }
819                 return success;
820
821         }
822         catch(exception& e) {
823                 m->errorOut(e, "TrimSeqsCommand", "stripForward");
824                 exit(1);
825         }
826 }
827
828 //***************************************************************************************************************
829
830 bool TrimSeqsCommand::stripReverse(Sequence& seq){
831         try {
832                 string rawSequence = seq.getUnaligned();
833                 bool success = 0;       //guilty until proven innocent
834                 
835                 for(int i=0;i<numRPrimers;i++){
836                         string oligo = revPrimer[i];
837                         
838                         if(rawSequence.length() < oligo.length()){
839                                 success = 0;
840                                 break;
841                         }
842                         
843                         if(compareDNASeq(oligo, rawSequence.substr(rawSequence.length()-oligo.length(),oligo.length()))){
844                                 seq.setUnaligned(rawSequence.substr(0,rawSequence.length()-oligo.length()));
845                                 success = 1;
846                                 break;
847                         }
848                 }       
849                 return success;
850                 
851         }
852         catch(exception& e) {
853                 m->errorOut(e, "TrimSeqsCommand", "stripReverse");
854                 exit(1);
855         }
856 }
857
858 //***************************************************************************************************************
859
860 bool TrimSeqsCommand::cullLength(Sequence& seq){
861         try {
862         
863                 int length = seq.getNumBases();
864                 bool success = 0;       //guilty until proven innocent
865                 
866                 if(length >= minLength && maxLength == 0)                       {       success = 1;    }
867                 else if(length >= minLength && length <= maxLength)     {       success = 1;    }
868                 else                                                                                            {       success = 0;    }
869                 
870                 return success;
871         
872         }
873         catch(exception& e) {
874                 m->errorOut(e, "TrimSeqsCommand", "cullLength");
875                 exit(1);
876         }
877         
878 }
879
880 //***************************************************************************************************************
881
882 bool TrimSeqsCommand::cullHomoP(Sequence& seq){
883         try {
884                 int longHomoP = seq.getLongHomoPolymer();
885                 bool success = 0;       //guilty until proven innocent
886                 
887                 if(longHomoP <= maxHomoP){      success = 1;    }
888                 else                                    {       success = 0;    }
889                 
890                 return success;
891         }
892         catch(exception& e) {
893                 m->errorOut(e, "TrimSeqsCommand", "cullHomoP");
894                 exit(1);
895         }
896         
897 }
898
899 //***************************************************************************************************************
900
901 bool TrimSeqsCommand::cullAmbigs(Sequence& seq){
902         try {
903                 int numNs = seq.getAmbigBases();
904                 bool success = 0;       //guilty until proven innocent
905                 
906                 if(numNs <= maxAmbig)   {       success = 1;    }
907                 else                                    {       success = 0;    }
908                 
909                 return success;
910         }
911         catch(exception& e) {
912                 m->errorOut(e, "TrimSeqsCommand", "cullAmbigs");
913                 exit(1);
914         }
915         
916 }
917
918 //***************************************************************************************************************
919
920 bool TrimSeqsCommand::compareDNASeq(string oligo, string seq){
921         try {
922                 bool success = 1;
923                 int length = oligo.length();
924                 
925                 for(int i=0;i<length;i++){
926                         
927                         if(oligo[i] != seq[i]){
928                                 if(oligo[i] == 'A' || oligo[i] == 'T' || oligo[i] == 'G' || oligo[i] == 'C')    {       success = 0;    }
929                                 else if((oligo[i] == 'N' || oligo[i] == 'I') && (seq[i] == 'N'))                                {       success = 0;    }
930                                 else if(oligo[i] == 'R' && (seq[i] != 'A' && seq[i] != 'G'))                                    {       success = 0;    }
931                                 else if(oligo[i] == 'Y' && (seq[i] != 'C' && seq[i] != 'T'))                                    {       success = 0;    }
932                                 else if(oligo[i] == 'M' && (seq[i] != 'C' && seq[i] != 'A'))                                    {       success = 0;    }
933                                 else if(oligo[i] == 'K' && (seq[i] != 'T' && seq[i] != 'G'))                                    {       success = 0;    }
934                                 else if(oligo[i] == 'W' && (seq[i] != 'T' && seq[i] != 'A'))                                    {       success = 0;    }
935                                 else if(oligo[i] == 'S' && (seq[i] != 'C' && seq[i] != 'G'))                                    {       success = 0;    }
936                                 else if(oligo[i] == 'B' && (seq[i] != 'C' && seq[i] != 'T' && seq[i] != 'G'))   {       success = 0;    }
937                                 else if(oligo[i] == 'D' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'G'))   {       success = 0;    }
938                                 else if(oligo[i] == 'H' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'C'))   {       success = 0;    }
939                                 else if(oligo[i] == 'V' && (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G'))   {       success = 0;    }                       
940                                 
941                                 if(success == 0)        {       break;   }
942                         }
943                         else{
944                                 success = 1;
945                         }
946                 }
947                 
948                 return success;
949         }
950         catch(exception& e) {
951                 m->errorOut(e, "TrimSeqsCommand", "compareDNASeq");
952                 exit(1);
953         }
954
955 }
956 //***************************************************************************************************************
957
958 int TrimSeqsCommand::countDiffs(string oligo, string seq){
959         try {
960
961                 int length = oligo.length();
962                 int countDiffs = 0;
963                 
964                 for(int i=0;i<length;i++){
965                                                                 
966                         if(oligo[i] != seq[i]){
967                                 if(oligo[i] == 'A' || oligo[i] == 'T' || oligo[i] == 'G' || oligo[i] == 'C' || oligo[i] == '-' || oligo[i] == '.')      {       countDiffs++;   }
968                                 else if((oligo[i] == 'N' || oligo[i] == 'I') && (seq[i] == 'N'))                                {       countDiffs++;   }
969                                 else if(oligo[i] == 'R' && (seq[i] != 'A' && seq[i] != 'G'))                                    {       countDiffs++;   }
970                                 else if(oligo[i] == 'Y' && (seq[i] != 'C' && seq[i] != 'T'))                                    {       countDiffs++;   }
971                                 else if(oligo[i] == 'M' && (seq[i] != 'C' && seq[i] != 'A'))                                    {       countDiffs++;   }
972                                 else if(oligo[i] == 'K' && (seq[i] != 'T' && seq[i] != 'G'))                                    {       countDiffs++;   }
973                                 else if(oligo[i] == 'W' && (seq[i] != 'T' && seq[i] != 'A'))                                    {       countDiffs++;   }
974                                 else if(oligo[i] == 'S' && (seq[i] != 'C' && seq[i] != 'G'))                                    {       countDiffs++;   }
975                                 else if(oligo[i] == 'B' && (seq[i] != 'C' && seq[i] != 'T' && seq[i] != 'G'))   {       countDiffs++;   }
976                                 else if(oligo[i] == 'D' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'G'))   {       countDiffs++;   }
977                                 else if(oligo[i] == 'H' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'C'))   {       countDiffs++;   }
978                                 else if(oligo[i] == 'V' && (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G'))   {       countDiffs++;   }                       
979                                 
980                         }
981                 }
982                 
983                 return countDiffs;
984         }
985         catch(exception& e) {
986                 m->errorOut(e, "TrimSeqsCommand", "countDiffs");
987                 exit(1);
988         }
989
990 }
991 //***************************************************************************************************************
992
993 bool TrimSeqsCommand::stripQualThreshold(Sequence& seq, ifstream& qFile){
994         try {
995                 string rawSequence = seq.getUnaligned();
996                 int seqLength;  // = rawSequence.length();
997                 string name, temp, temp2;
998                 
999                 qFile >> name >> temp;
1000         
1001                 splitAtEquals(temp2, temp); //separates length=242, temp=length, temp2=242
1002                 convert(temp, seqLength); //converts string to int
1003         
1004                 if (name.length() != 0) {  if(name.substr(1) != seq.getName())  {       m->mothurOut("sequence name mismatch btwn fasta and qual file"); m->mothurOutEndLine(); }  } 
1005                 while (!qFile.eof())    {       char c = qFile.get(); if (c == 10 || c == 13){  break;  }       }
1006                 
1007                 int score;
1008                 int end = seqLength;
1009                 
1010                 for(int i=0;i<seqLength;i++){
1011                         qFile >> score;
1012                         
1013                         if(score <= qThreshold){
1014                                 end = i;
1015                                 break;
1016                         }
1017                 }
1018                 for(int i=end+1;i<seqLength;i++){
1019                         qFile >> score;
1020                 }
1021                 
1022                 seq.setUnaligned(rawSequence.substr(0,end));
1023                 
1024                 return 1;
1025         }
1026         catch(exception& e) {
1027                 m->errorOut(e, "TrimSeqsCommand", "stripQualThreshold");
1028                 exit(1);
1029         }
1030 }
1031
1032 //***************************************************************************************************************
1033
1034 bool TrimSeqsCommand::cullQualAverage(Sequence& seq, ifstream& qFile){
1035         try {
1036                 string rawSequence = seq.getUnaligned();
1037                 int seqLength = seq.getNumBases();
1038                 bool success = 0;       //guilty until proven innocent
1039                 string name;
1040                 
1041                 qFile >> name;
1042                 if (name[0] == '>') {  if(name.substr(1) != seq.getName())      {       m->mothurOut("sequence name mismatch btwn fasta: " + seq.getName() + " and qual file: " + name); m->mothurOutEndLine(); }  }
1043                 
1044                 while (!qFile.eof())    {       char c = qFile.get(); if (c == 10 || c == 13){  break;  }       }
1045                 
1046                 float score;    
1047                 float average = 0;
1048                 
1049                 for(int i=0;i<seqLength;i++){
1050                         qFile >> score;
1051                         average += score;
1052                 }
1053                 average /= seqLength;
1054
1055                 if(average >= qAverage) {       success = 1;    }
1056                 else                                    {       success = 0;    }
1057                 
1058                 return success;
1059         }
1060         catch(exception& e) {
1061                 m->errorOut(e, "TrimSeqsCommand", "cullQualAverage");
1062                 exit(1);
1063         }
1064 }
1065
1066 //***************************************************************************************************************