]> git.donarmstrong.com Git - mothur.git/blob - trimflowscommand.cpp
mods to shhh.seqs and trim.flows to fix eye candy output
[mothur.git] / trimflowscommand.cpp
1 /*
2  *  trimflowscommand.cpp
3  *  Mothur
4  *
5  *  Created by Pat Schloss on 12/22/10.
6  *  Copyright 2010 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "trimflowscommand.h"
11 #include "needlemanoverlap.hpp"
12
13 //**********************************************************************************************************************
14 vector<string> TrimFlowsCommand::setParameters(){       
15         try {
16                 CommandParameter pflow("flow", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pflow);
17                 CommandParameter poligos("oligos", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(poligos);
18                 CommandParameter pmaxhomop("maxhomop", "Number", "", "9", "", "", "",false,false); parameters.push_back(pmaxhomop);
19                 CommandParameter pmaxflows("maxflows", "Number", "", "720", "", "", "",false,false); parameters.push_back(pmaxflows);
20                 CommandParameter pminflows("minflows", "Number", "", "360", "", "", "",false,false); parameters.push_back(pminflows);
21                 CommandParameter pminlength("minlength", "Number", "", "0", "", "", "",false,false); parameters.push_back(pminlength);
22                 CommandParameter pmaxlength("maxlength", "Number", "", "0", "", "", "",false,false); parameters.push_back(pmaxlength);
23                 CommandParameter ppdiffs("pdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(ppdiffs);
24                 CommandParameter pbdiffs("bdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(pbdiffs);
25                 CommandParameter ptdiffs("tdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(ptdiffs);
26                 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
27                 CommandParameter psignal("signal", "Number", "", "0.50", "", "", "",false,false); parameters.push_back(psignal);
28                 CommandParameter pnoise("noise", "Number", "", "0.70", "", "", "",false,false); parameters.push_back(pnoise);
29                 CommandParameter pallfiles("allfiles", "Boolean", "", "t", "", "", "",false,false); parameters.push_back(pallfiles);
30                 CommandParameter porder("order", "String", "", "", "", "", "",false,false); parameters.push_back(porder);
31                 CommandParameter pfasta("fasta", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pfasta);
32                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
33                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
34                 
35                 vector<string> myArray;
36                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
37                 return myArray;
38         }
39         catch(exception& e) {
40                 m->errorOut(e, "TrimFlowsCommand", "setParameters");
41                 exit(1);
42         }
43 }
44 //**********************************************************************************************************************
45 string TrimFlowsCommand::getHelpString(){       
46         try {
47                 string helpString = "";
48                 helpString += "The trim.flows command reads a flowgram file and creates .....\n";
49                 helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n";
50                 helpString += "For more details please check out the wiki http://www.mothur.org/wiki/Trim.flows.\n";
51                 return helpString;
52         }
53         catch(exception& e) {
54                 m->errorOut(e, "TrimFlowsCommand", "getHelpString");
55                 exit(1);
56         }
57 }
58
59 //**********************************************************************************************************************
60
61 TrimFlowsCommand::TrimFlowsCommand(){   
62         try {
63                 abort = true; calledHelp = true; 
64                 setParameters();
65                 vector<string> tempOutNames;
66                 outputTypes["flow"] = tempOutNames;
67                 outputTypes["fasta"] = tempOutNames;
68         }
69         catch(exception& e) {
70                 m->errorOut(e, "TrimFlowsCommand", "TrimFlowsCommand");
71                 exit(1);
72         }
73 }
74 //**********************************************************************************************************************
75
76 TrimFlowsCommand::TrimFlowsCommand(string option)  {
77         try {
78                 
79                 abort = false; calledHelp = false;   
80                 comboStarts = 0;
81                 
82                 //allow user to run help
83                 if(option == "help") { help(); abort = true; calledHelp = true; }
84                 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
85                 
86                 else {
87                                                 
88                         vector<string> myArray = setParameters();
89                         
90                         OptionParser parser(option);
91                         map<string,string> parameters = parser.getParameters();
92                         
93                         ValidParameters validParameter;
94                         map<string,string>::iterator it;
95                         
96                         //check to make sure all parameters are valid for command
97                         for (it = parameters.begin(); it != parameters.end(); it++) { 
98                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
99                         }
100                         
101                         //initialize outputTypes
102                         vector<string> tempOutNames;
103                         outputTypes["flow"] = tempOutNames;
104                         outputTypes["fasta"] = tempOutNames;
105                         
106                         //if the user changes the input directory command factory will send this info to us in the output parameter 
107                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
108                         if (inputDir == "not found"){   inputDir = "";          }
109                         else {
110                                 string path;
111                                 it = parameters.find("flow");
112                                 //user has given a template file
113                                 if(it != parameters.end()){ 
114                                         path = m->hasPath(it->second);
115                                         //if the user has not given a path then, add inputdir. else leave path alone.
116                                         if (path == "") {       parameters["flow"] = inputDir + it->second;             }
117                                 }
118                                 
119                                 it = parameters.find("oligos");
120                                 //user has given a template file
121                                 if(it != parameters.end()){ 
122                                         path = m->hasPath(it->second);
123                                         //if the user has not given a path then, add inputdir. else leave path alone.
124                                         if (path == "") {       parameters["oligos"] = inputDir + it->second;           }
125                                 }
126                                 
127                         }
128                         
129                         
130                         //check for required parameters
131                         flowFileName = validParameter.validFile(parameters, "flow", true);
132                         if (flowFileName == "not found") { 
133                                 flowFileName = m->getFlowFile(); 
134                                 if (flowFileName != "") {  m->mothurOut("Using " + flowFileName + " as input file for the flow parameter."); m->mothurOutEndLine(); }
135                                 else { 
136                                         m->mothurOut("No valid current flow file. You must provide a flow file."); m->mothurOutEndLine(); 
137                                         abort = true;
138                                 } 
139                         }else if (flowFileName == "not open") { flowFileName = ""; abort = true; }      
140                         
141                         //if the user changes the output directory command factory will send this info to us in the output parameter 
142                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  
143                                 outputDir = ""; 
144                                 outputDir += m->hasPath(flowFileName); //if user entered a file with a path then preserve it    
145                         }
146                         
147                         
148                         //check for optional parameter and set defaults
149                         // ...at some point should added some additional type checking...
150                         
151                         string temp;
152                         temp = validParameter.validFile(parameters, "minflows", false); if (temp == "not found") { temp = "450"; }
153                         convert(temp, minFlows);  
154
155                         temp = validParameter.validFile(parameters, "maxflows", false); if (temp == "not found") { temp = "450"; }
156                         convert(temp, maxFlows);  
157                         
158                         
159                         temp = validParameter.validFile(parameters, "oligos", true);
160                         if (temp == "not found")        {       oligoFileName = "";             }
161                         else if(temp == "not open")     {       abort = true;                   } 
162                         else                                            {       oligoFileName = temp;   }
163                         
164                         temp = validParameter.validFile(parameters, "fasta", false);            if (temp == "not found"){       fasta = 0;              }
165                         else if(m->isTrue(temp))        {       fasta = 1;      }
166                         
167                         temp = validParameter.validFile(parameters, "maxhomop", false);         if (temp == "not found"){       temp = "9";             }
168                         convert(temp, maxHomoP);  
169
170                         temp = validParameter.validFile(parameters, "signal", false);           if (temp == "not found"){       temp = "0.50";  }
171                         convert(temp, signal);  
172
173                         temp = validParameter.validFile(parameters, "noise", false);            if (temp == "not found"){       temp = "0.70";  }
174                         convert(temp, noise);  
175
176                         temp = validParameter.validFile(parameters, "minlength", false);        if (temp == "not found"){       temp = "0";             }
177                         convert(temp, minLength); 
178                         
179                         temp = validParameter.validFile(parameters, "maxlength", false);        if (temp == "not found"){       temp = "0";             }
180                         convert(temp, maxLength);
181                         
182                         temp = validParameter.validFile(parameters, "bdiffs", false);           if (temp == "not found"){       temp = "0";             }
183                         convert(temp, bdiffs);
184                         
185                         temp = validParameter.validFile(parameters, "pdiffs", false);           if (temp == "not found"){       temp = "0";             }
186                         convert(temp, pdiffs);
187                         
188                         temp = validParameter.validFile(parameters, "tdiffs", false);
189                         if (temp == "not found"){ int tempTotal = pdiffs + bdiffs;  temp = toString(tempTotal); }
190                         convert(temp, tdiffs);
191                         if(tdiffs == 0){        tdiffs = bdiffs + pdiffs;       }
192                         
193                         temp = validParameter.validFile(parameters, "allfiles", false);         if (temp == "not found"){ temp = "T";           }
194                         allFiles = m->isTrue(temp);
195                         
196                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found"){       temp = m->getProcessors();      }
197                         m->setProcessors(temp);
198                         convert(temp, processors);
199         
200                         flowOrder = validParameter.validFile(parameters, "order", false);
201                         if (flowOrder == "not found"){ flowOrder = "TACG";              }
202                         else if(flowOrder.length() != 4){
203                                 m->mothurOut("The value of the order option must be four bases long\n");
204                         }
205
206                         if(oligoFileName == ""){        allFiles = 0;           }
207
208                         numFPrimers = 0;
209                         numRPrimers = 0;
210                 }
211                 
212         }
213         catch(exception& e) {
214                 m->errorOut(e, "TrimFlowsCommand", "TrimSeqsCommand");
215                 exit(1);
216         }
217 }
218
219 //***************************************************************************************************************
220
221 int TrimFlowsCommand::execute(){
222         try{
223                 
224                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
225
226                 string trimFlowFileName = outputDir + m->getRootName(m->getSimpleName(flowFileName)) + "trim.flow";
227                 outputNames.push_back(trimFlowFileName); outputTypes["flow"].push_back(trimFlowFileName);
228                 
229                 string scrapFlowFileName = outputDir + m->getRootName(m->getSimpleName(flowFileName)) + "scrap.flow";
230                 outputNames.push_back(scrapFlowFileName); outputTypes["flow"].push_back(scrapFlowFileName);
231
232                 string fastaFileName = outputDir + m->getRootName(m->getSimpleName(flowFileName)) + "flow.fasta";
233                 if(fasta){
234                         outputNames.push_back(fastaFileName); outputTypes["fasta"].push_back(fastaFileName);
235                 }
236                 
237                 vector<unsigned long int> flowFilePos = getFlowFileBreaks();
238                 for (int i = 0; i < (flowFilePos.size()-1); i++) {
239                         lines.push_back(new linePair(flowFilePos[i], flowFilePos[(i+1)]));
240                 }       
241
242                 vector<vector<string> > barcodePrimerComboFileNames;
243                 if(oligoFileName != ""){
244                         getOligos(barcodePrimerComboFileNames); 
245                 }
246                 
247 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
248                 if(processors == 1){
249                         driverCreateTrim(flowFileName, trimFlowFileName, scrapFlowFileName, fastaFileName, barcodePrimerComboFileNames, lines[0]);
250                 }else{
251                         createProcessesCreateTrim(flowFileName, trimFlowFileName, scrapFlowFileName, fastaFileName, barcodePrimerComboFileNames); 
252                 }       
253 #else
254                 driverCreateTrim(flowFileName, trimFlowFileName, scrapFlowFileName, fastaFileName, barcodePrimerComboFileNames, lines[0]);
255 #endif
256                 
257                 if (m->control_pressed) {  return 0; }                  
258                 
259                 string flowFilesFileName;
260                 ofstream output;
261                 
262                 if(allFiles){
263                         
264                         flowFilesFileName = outputDir + m->getRootName(m->getSimpleName(flowFileName)) + "flow.files";
265                         m->openOutputFile(flowFilesFileName, output);
266
267                         for(int i=0;i<barcodePrimerComboFileNames.size();i++){
268                                 for(int j=0;j<barcodePrimerComboFileNames[0].size();j++){
269                                         
270                                         FILE * pFile;
271                                         unsigned long int size;
272                                         
273                                         //get num bytes in file
274                                         pFile = fopen (barcodePrimerComboFileNames[i][j].c_str(),"rb");
275                                         if (pFile==NULL) perror ("Error opening file");
276                                         else{
277                                                 fseek (pFile, 0, SEEK_END);
278                                                 size=ftell(pFile);
279                                                 fclose (pFile);
280                                         }
281
282                                         if(size < 10){
283                                                 remove(barcodePrimerComboFileNames[i][j].c_str());
284                                         }
285                                         else{
286                                                 output << barcodePrimerComboFileNames[i][j] << endl;
287                                                 outputNames.push_back(barcodePrimerComboFileNames[i][j]);
288                                                 outputTypes["flow"].push_back(barcodePrimerComboFileNames[i][j]);
289                                         }
290                                 }
291                         }
292                         output.close();
293                 }
294                 else{
295                         flowFilesFileName = outputDir + m->getRootName(m->getSimpleName(flowFileName)) + "flow.files";
296                         m->openOutputFile(flowFilesFileName, output);
297                         
298                         output << trimFlowFileName << endl;
299                         
300                         output.close();
301                 }
302                 outputTypes["flow.files"].push_back(flowFilesFileName);
303                 outputNames.push_back(flowFilesFileName);
304                 
305 //              set fasta file as new current fastafile
306 //              string current = "";
307 //              itTypes = outputTypes.find("fasta");
308 //              if (itTypes != outputTypes.end()) {
309 //                      if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFastaFile(current); }
310 //              }
311                 
312                 m->mothurOutEndLine();
313                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
314                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
315                 m->mothurOutEndLine();
316                 
317                 return 0;       
318         }
319         catch(exception& e) {
320                 m->errorOut(e, "TrimSeqsCommand", "execute");
321                 exit(1);
322         }
323 }
324
325 //***************************************************************************************************************
326
327 int TrimFlowsCommand::driverCreateTrim(string flowFileName, string trimFlowFileName, string scrapFlowFileName, string fastaFileName, vector<vector<string> > barcodePrimerComboFileNames, linePair* line){
328         
329         try {
330                 
331                 ofstream trimFlowFile;
332                 m->openOutputFile(trimFlowFileName, trimFlowFile);
333                 trimFlowFile.setf(ios::fixed, ios::floatfield); trimFlowFile.setf(ios::showpoint);
334
335                 ofstream scrapFlowFile;
336                 m->openOutputFile(scrapFlowFileName, scrapFlowFile);
337                 scrapFlowFile.setf(ios::fixed, ios::floatfield); scrapFlowFile.setf(ios::showpoint);
338
339                 if(line->start == 4){
340                         scrapFlowFile << maxFlows << endl;
341                         trimFlowFile << maxFlows << endl;
342                         if(allFiles){
343                                 for(int i=0;i<barcodePrimerComboFileNames.size();i++){
344                                         for(int j=0;j<barcodePrimerComboFileNames[0].size();j++){
345                                                 //                              barcodePrimerComboFileNames[i][j] += toString(getpid()) + ".temp";
346                                                 ofstream temp;
347                                                 m->openOutputFile(barcodePrimerComboFileNames[i][j], temp);
348                                                 temp << maxFlows << endl;
349                                                 temp.close();
350                                         }
351                                 }                       
352                         }
353                 }
354                 
355                 FlowData flowData(numFlows, signal, noise, maxHomoP, flowOrder);
356                 
357                 ofstream fastaFile;
358                 if(fasta){      m->openOutputFile(fastaFileName, fastaFile);    }
359                 
360                 ifstream flowFile;
361                 m->openInputFile(flowFileName, flowFile);
362                 
363                 flowFile.seekg(line->start);
364
365                 int count = 0;
366                 bool moreSeqs = 1;
367                         
368                 while(moreSeqs) {
369                         
370                         int success = 1;
371                         int currentSeqDiffs = 0;
372                         string trashCode = "";
373                         
374                         flowData.getNext(flowFile);
375                         flowData.capFlows(maxFlows);    
376                         
377                         Sequence currSeq = flowData.getSequence();
378                         if(!flowData.hasMinFlows(minFlows)){    //screen to see if sequence is of a minimum number of flows
379                                 success = 0;
380                                 trashCode += 'l';
381                         }
382                         
383                         if(minLength > 0 || maxLength > 0){     //screen to see if sequence is above and below a specific number of bases
384                                 int seqLength = currSeq.getNumBases();
385                                 if(seqLength < minLength || seqLength > maxLength){
386                                         success = 0;
387                                         trashCode += 'l';
388                                 }
389                         }
390                         
391                         int primerIndex = 0;
392                         int barcodeIndex = 0;
393                         
394                         if(barcodes.size() != 0){
395                                 success = stripBarcode(currSeq, barcodeIndex);
396                                 if(success > bdiffs)            {       trashCode += 'b';       }
397                                 else{ currentSeqDiffs += success;  }
398                         }
399                         
400                         if(numFPrimers != 0){
401                                 success = stripForward(currSeq, primerIndex);
402                                 if(success > pdiffs)            {       trashCode += 'f';       }
403                                 else{ currentSeqDiffs += success;  }
404                         }
405                         
406                         if (currentSeqDiffs > tdiffs)   {       trashCode += 't';   }
407                         
408                         if(numRPrimers != 0){
409                                 success = stripReverse(currSeq);
410                                 if(!success)                            {       trashCode += 'r';       }
411                         }
412
413                         if(trashCode.length() == 0){
414                                                         
415                                 flowData.printFlows(trimFlowFile);
416                         
417                                 if(fasta)       {       currSeq.printSequence(fastaFile);       }
418                                 
419                                 if(allFiles){
420                                         ofstream output;
421                                         m->openOutputFileAppend(barcodePrimerComboFileNames[barcodeIndex][primerIndex], output);
422                                         output.setf(ios::fixed, ios::floatfield); trimFlowFile.setf(ios::showpoint);
423                                         
424                                         flowData.printFlows(output);
425                                         output.close();
426                                 }                               
427                         }
428                         else{
429                                 flowData.printFlows(scrapFlowFile, trashCode);
430                         }
431                                 
432                         count++;
433                                                 
434                         //report progress
435                         if((count) % 10000 == 0){       m->mothurOut(toString(count)); m->mothurOutEndLine();           }
436
437 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
438                         unsigned long int pos = flowFile.tellg();
439
440                         if ((pos == -1) || (pos >= line->end)) { break; }
441 #else
442                         if (flowFile.eof()) { break; }
443 #endif
444                         
445                 }
446                 //report progress
447                 if((count) % 10000 != 0){       m->mothurOut(toString(count)); m->mothurOutEndLine();           }
448                 
449                 trimFlowFile.close();
450                 scrapFlowFile.close();
451                 flowFile.close();
452                 if(fasta){      fastaFile.close();      }
453                 
454                 return count;
455         }
456         catch(exception& e) {
457                 m->errorOut(e, "TrimSeqsCommand", "driverCreateTrim");
458                 exit(1);
459         }
460 }
461
462 //***************************************************************************************************************
463
464 void TrimFlowsCommand::getOligos(vector<vector<string> >& outFlowFileNames){
465         try {
466                 ifstream oligosFile;
467                 m->openInputFile(oligoFileName, oligosFile);
468                 
469                 string type, oligo, group;
470
471                 int indexPrimer = 0;
472                 int indexBarcode = 0;
473                 
474                 while(!oligosFile.eof()){
475                 
476                         oligosFile >> type; m->gobble(oligosFile);      //get the first column value of the row - is it a comment or a feature we are interested in?
477
478                         if(type[0] == '#'){     //igore the line because there's a comment
479                                 while (!oligosFile.eof())       {       char c = oligosFile.get(); if (c == 10 || c == 13){     break;  }       } // get rest of line if there's any crap there
480                         }
481                         else{                           //there's a feature we're interested in
482
483                                 for(int i=0;i<type.length();i++){       type[i] = toupper(type[i]);  }                                  //make type case insensitive
484
485                                 oligosFile >> oligo;    //get the DNA sequence for the feature
486
487                                 for(int i=0;i<oligo.length();i++){      //make type case insensitive and change any U's to T's
488                                         oligo[i] = toupper(oligo[i]);
489                                         if(oligo[i] == 'U')     {       oligo[i] = 'T'; }
490                                 }
491
492                                 if(type == "FORWARD"){  //if the feature is a forward primer...
493                                         group = "";
494
495                                         while (!oligosFile.eof())       {       // get rest of line in case there is a primer name = will have the name of the primer
496                                                 char c = oligosFile.get(); 
497                                                 if (c == 10 || c == 13){        break;  }
498                                                 else if (c == 32 || c == 9){;} //space or tab
499                                                 else {  group += c;  }
500                                         } 
501
502                                         //have we seen this primer already?
503                                         map<string, int>::iterator itPrimer = primers.find(oligo);
504                                         if (itPrimer != primers.end()) { m->mothurOut("primer " + oligo + " is in your oligos file already."); m->mothurOutEndLine();  }
505
506                                         primers[oligo]=indexPrimer; indexPrimer++;
507                                         primerNameVector.push_back(group);
508
509                                 }
510                                 else if(type == "REVERSE"){
511                                         Sequence oligoRC("reverse", oligo);
512                                         oligoRC.reverseComplement();
513                                         revPrimer.push_back(oligoRC.getUnaligned());
514                                 }
515                                 else if(type == "BARCODE"){
516                                         oligosFile >> group;
517
518                                         //check for repeat barcodes
519                                         map<string, int>::iterator itBar = barcodes.find(oligo);
520                                         if (itBar != barcodes.end()) { m->mothurOut("barcode " + oligo + " is in your oligos file already."); m->mothurOutEndLine();  }
521
522                                         barcodes[oligo]=indexBarcode; indexBarcode++;
523                                         barcodeNameVector.push_back(group);
524                                 }
525                                 else{
526                                         m->mothurOut(type + " is not recognized as a valid type. Choices are forward, reverse, and barcode. Ignoring " + oligo + "."); m->mothurOutEndLine();  
527                                 }
528                         }
529
530                         m->gobble(oligosFile);
531                 }
532                 oligosFile.close();
533                 
534                 if(barcodeNameVector.size() == 0 && primerNameVector[0] == ""){ allFiles = 0;   }
535                 
536                 //add in potential combos
537                 if(barcodeNameVector.size() == 0){
538                         barcodes[""] = 0;
539                         barcodeNameVector.push_back("");                        
540                 }
541                 
542                 if(primerNameVector.size() == 0){
543                         primers[""] = 0;
544                         primerNameVector.push_back("");                 
545                 }
546                 
547                 
548                 outFlowFileNames.resize(barcodeNameVector.size());
549                 for(int i=0;i<outFlowFileNames.size();i++){
550                         outFlowFileNames[i].assign(primerNameVector.size(), "");
551                 }
552                 
553                 if(allFiles){
554
555                         for(map<string, int>::iterator itBar = barcodes.begin();itBar != barcodes.end();itBar++){
556                                 for(map<string, int>::iterator itPrimer = primers.begin();itPrimer != primers.end(); itPrimer++){
557
558                                         string primerName = primerNameVector[itPrimer->second];
559                                         string barcodeName = barcodeNameVector[itBar->second];
560                                                                                 
561                                         string comboGroupName = "";
562                                         string fileName = "";
563                                         
564                                         if(primerName == ""){
565                                                 comboGroupName = barcodeNameVector[itBar->second];
566                                                 fileName = outputDir + m->getRootName(m->getSimpleName(flowFileName)) + comboGroupName + ".flow";
567                                         }
568                                         else{
569                                                 if(barcodeName == ""){
570                                                         comboGroupName = primerNameVector[itPrimer->second];
571                                                 }
572                                                 else{
573                                                         comboGroupName = barcodeNameVector[itBar->second] + "." + primerNameVector[itPrimer->second];
574                                                 }
575                                                 fileName = outputDir + m->getRootName(m->getSimpleName(flowFileName)) + comboGroupName + ".flow";
576                                         }
577                                         
578                                         outFlowFileNames[itBar->second][itPrimer->second] = fileName;
579                                         
580                                         ofstream temp;
581                                         m->openOutputFile(fileName, temp);
582                                         temp.close();
583                                 }
584                         }
585                 }
586                 
587                 numFPrimers = primers.size();
588                 numRPrimers = revPrimer.size();
589                 
590         }
591         catch(exception& e) {
592                 m->errorOut(e, "TrimSeqsCommand", "getOligos");
593                 exit(1);
594         }
595 }
596
597 //***************************************************************************************************************
598
599 int TrimFlowsCommand::stripBarcode(Sequence& seq, int& group){
600         try {
601                 
602                 string rawSequence = seq.getUnaligned();
603                 int success = bdiffs + 1;       //guilty until proven innocent
604                 
605                 //can you find the barcode
606                 for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
607                         string oligo = it->first;
608                         if(rawSequence.length() < oligo.length()){      //let's just assume that the barcodes are the same length
609                                 success = bdiffs + 10;                                  //if the sequence is shorter than the barcode then bail out
610                                 break;  
611                         }
612                         
613                         if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
614                                 group = it->second;
615                                 seq.setUnaligned(rawSequence.substr(oligo.length()));
616                                 
617                                 success = 0;
618                                 break;
619                         }
620                 }
621                 
622                 //if you found the barcode or if you don't want to allow for diffs
623                 if ((bdiffs == 0) || (success == 0)) { return success;  }
624                 
625                 else { //try aligning and see if you can find it
626                         
627                         int maxLength = 0;
628                         
629                         Alignment* alignment;
630                         if (barcodes.size() > 0) {
631                                 map<string,int>::iterator it=barcodes.begin();
632                                 
633                                 for(it;it!=barcodes.end();it++){
634                                         if(it->first.length() > maxLength){
635                                                 maxLength = it->first.length();
636                                         }
637                                 }
638                                 alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+bdiffs+1));  
639                                 
640                         }else{ alignment = NULL; } 
641                         
642                         //can you find the barcode
643                         int minDiff = 1e6;
644                         int minCount = 1;
645                         int minGroup = -1;
646                         int minPos = 0;
647                         
648                         for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
649                                 string oligo = it->first;
650                                 //                              int length = oligo.length();
651                                 
652                                 if(rawSequence.length() < maxLength){   //let's just assume that the barcodes are the same length
653                                         success = bdiffs + 10;
654                                         break;
655                                 }
656                                 
657                                 //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
658                                 alignment->align(oligo, rawSequence.substr(0,oligo.length()+bdiffs));
659                                 oligo = alignment->getSeqAAln();
660                                 string temp = alignment->getSeqBAln();
661                                 
662                                 int alnLength = oligo.length();
663                                 
664                                 for(int i=oligo.length()-1;i>=0;i--){
665                                         if(oligo[i] != '-'){    alnLength = i+1;        break;  }
666                                 }
667                                 oligo = oligo.substr(0,alnLength);
668                                 temp = temp.substr(0,alnLength);
669                                 
670                                 int numDiff = countDiffs(oligo, temp);
671                                 
672                                 if(numDiff < minDiff){
673                                         minDiff = numDiff;
674                                         minCount = 1;
675                                         minGroup = it->second;
676                                         minPos = 0;
677                                         for(int i=0;i<alnLength;i++){
678                                                 if(temp[i] != '-'){
679                                                         minPos++;
680                                                 }
681                                         }
682                                 }
683                                 else if(numDiff == minDiff){
684                                         minCount++;
685                                 }
686                                 
687                         }
688                         
689                         if(minDiff > bdiffs)    {       success = minDiff;              }       //no good matches
690                         else if(minCount > 1)   {       success = bdiffs + 100; }       //can't tell the difference between multiple barcodes
691                         else{                                                                                                   //use the best match
692                                 group = minGroup;
693                                 seq.setUnaligned(rawSequence.substr(minPos));
694                                 success = minDiff;
695                         }
696                         
697                         if (alignment != NULL) {  delete alignment;  }
698                         
699                 }
700                 
701                 return success;
702                 
703         }
704         catch(exception& e) {
705                 m->errorOut(e, "TrimFlowsCommand", "stripBarcode");
706                 exit(1);
707         }
708         
709 }
710
711 //***************************************************************************************************************
712
713 int TrimFlowsCommand::stripForward(Sequence& seq, int& group){
714         try {
715                 
716                 string rawSequence = seq.getUnaligned();
717                 int success = pdiffs + 1;       //guilty until proven innocent
718
719                 //can you find the primer
720                 for(map<string,int>::iterator it=primers.begin();it!=primers.end();it++){
721                         string oligo = it->first;
722                         if(rawSequence.length() < oligo.length()){      //let's just assume that the primers are the same length
723                                 success = pdiffs + 10;                                  //if the sequence is shorter than the barcode then bail out
724                                 break;  
725                         }
726                         
727                         if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
728                                 group = it->second;
729                                 seq.setUnaligned(rawSequence.substr(oligo.length()));
730                                 success = 0;
731                                 break;
732                         }
733                 }
734                 
735                 //if you found the barcode or if you don't want to allow for diffs
736                 if ((pdiffs == 0) || (success == 0)) {  return success;  }
737                 
738                 else { //try aligning and see if you can find it
739                         
740                         int maxLength = 0;
741                         
742                         Alignment* alignment;
743                         if (primers.size() > 0) {
744                                 map<string,int>::iterator it=primers.begin();
745                                 
746                                 for(it;it!=primers.end();it++){
747                                         if(it->first.length() > maxLength){
748                                                 maxLength = it->first.length();
749                                         }
750                                 }
751                                 alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+pdiffs+1));  
752                                 
753                         }else{ alignment = NULL; } 
754                         
755                         //can you find the barcode
756                         int minDiff = 1e6;
757                         int minCount = 1;
758                         int minGroup = -1;
759                         int minPos = 0;
760                         
761                         for(map<string,int>::iterator it=primers.begin();it!=primers.end();it++){
762                                 string oligo = it->first;
763                                 //                              int length = oligo.length();
764                                 
765                                 if(rawSequence.length() < maxLength){   
766                                         success = pdiffs + 100;
767                                         break;
768                                 }
769                                 
770                                 //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
771                                 alignment->align(oligo, rawSequence.substr(0,oligo.length()+pdiffs));
772                                 oligo = alignment->getSeqAAln();
773                                 string temp = alignment->getSeqBAln();
774                                 
775                                 int alnLength = oligo.length();
776                                 
777                                 for(int i=oligo.length()-1;i>=0;i--){
778                                         if(oligo[i] != '-'){    alnLength = i+1;        break;  }
779                                 }
780                                 oligo = oligo.substr(0,alnLength);
781                                 temp = temp.substr(0,alnLength);
782                                 
783                                 int numDiff = countDiffs(oligo, temp);
784                                 
785                                 if(numDiff < minDiff){
786                                         minDiff = numDiff;
787                                         minCount = 1;
788                                         minGroup = it->second;
789                                         minPos = 0;
790                                         for(int i=0;i<alnLength;i++){
791                                                 if(temp[i] != '-'){
792                                                         minPos++;
793                                                 }
794                                         }
795                                 }
796                                 else if(numDiff == minDiff){
797                                         minCount++;
798                                 }
799                                 
800                         }
801                         
802                         if(minDiff > pdiffs)    {       success = minDiff;              }       //no good matches
803                         else if(minCount > 1)   {       success = pdiffs + 10;  }       //can't tell the difference between multiple primers
804                         else{                                                                                                   //use the best match
805                                 group = minGroup;
806                                 seq.setUnaligned(rawSequence.substr(minPos));
807                                 success = minDiff;
808                         }
809                         
810                         if (alignment != NULL) {  delete alignment;  }
811                         
812                 }
813                 
814                 return success;
815                 
816         }
817         catch(exception& e) {
818                 m->errorOut(e, "TrimFlowsCommand", "stripForward");
819                 exit(1);
820         }
821 }
822
823 //***************************************************************************************************************
824
825 bool TrimFlowsCommand::stripReverse(Sequence& seq){
826         try {
827
828                 string rawSequence = seq.getUnaligned();
829                 bool success = 0;       //guilty until proven innocent
830                 
831                 for(int i=0;i<numRPrimers;i++){
832                         string oligo = revPrimer[i];
833                         
834                         if(rawSequence.length() < oligo.length()){
835                                 success = 0;
836                                 break;
837                         }
838                         
839                         if(compareDNASeq(oligo, rawSequence.substr(rawSequence.length()-oligo.length(),oligo.length()))){
840                                 seq.setUnaligned(rawSequence.substr(0,rawSequence.length()-oligo.length()));
841                                 success = 1;
842                                 break;
843                         }
844                 }       
845
846                 return success;
847                 
848         }
849         catch(exception& e) {
850                 m->errorOut(e, "TrimFlowsCommand", "stripReverse");
851                 exit(1);
852         }
853 }
854
855
856 //***************************************************************************************************************
857
858 bool TrimFlowsCommand::compareDNASeq(string oligo, string seq){
859         try {
860                 bool success = 1;
861                 int length = oligo.length();
862                 
863                 for(int i=0;i<length;i++){
864                         
865                         if(oligo[i] != seq[i]){
866                                 if(oligo[i] == 'A' || oligo[i] == 'T' || oligo[i] == 'G' || oligo[i] == 'C')    {       success = 0;    }
867                                 else if((oligo[i] == 'N' || oligo[i] == 'I') && (seq[i] == 'N'))                                {       success = 0;    }
868                                 else if(oligo[i] == 'R' && (seq[i] != 'A' && seq[i] != 'G'))                                    {       success = 0;    }
869                                 else if(oligo[i] == 'Y' && (seq[i] != 'C' && seq[i] != 'T'))                                    {       success = 0;    }
870                                 else if(oligo[i] == 'M' && (seq[i] != 'C' && seq[i] != 'A'))                                    {       success = 0;    }
871                                 else if(oligo[i] == 'K' && (seq[i] != 'T' && seq[i] != 'G'))                                    {       success = 0;    }
872                                 else if(oligo[i] == 'W' && (seq[i] != 'T' && seq[i] != 'A'))                                    {       success = 0;    }
873                                 else if(oligo[i] == 'S' && (seq[i] != 'C' && seq[i] != 'G'))                                    {       success = 0;    }
874                                 else if(oligo[i] == 'B' && (seq[i] != 'C' && seq[i] != 'T' && seq[i] != 'G'))   {       success = 0;    }
875                                 else if(oligo[i] == 'D' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'G'))   {       success = 0;    }
876                                 else if(oligo[i] == 'H' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'C'))   {       success = 0;    }
877                                 else if(oligo[i] == 'V' && (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G'))   {       success = 0;    }                       
878                                 
879                                 if(success == 0)        {       break;   }
880                         }
881                         else{
882                                 success = 1;
883                         }
884                 }
885                 
886                 return success;
887         }
888         catch(exception& e) {
889                 m->errorOut(e, "TrimFlowsCommand", "compareDNASeq");
890                 exit(1);
891         }
892         
893 }
894
895 //***************************************************************************************************************
896
897 int TrimFlowsCommand::countDiffs(string oligo, string seq){
898         try {
899                 
900                 int length = oligo.length();
901                 int countDiffs = 0;
902                 
903                 for(int i=0;i<length;i++){
904                         
905                         if(oligo[i] != seq[i]){
906                                 if(oligo[i] == 'A' || oligo[i] == 'T' || oligo[i] == 'G' || oligo[i] == 'C' || oligo[i] == '-' || oligo[i] == '.')      {       countDiffs++;   }
907                                 else if((oligo[i] == 'N' || oligo[i] == 'I') && (seq[i] == 'N'))                                {       countDiffs++;   }
908                                 else if(oligo[i] == 'R' && (seq[i] != 'A' && seq[i] != 'G'))                                    {       countDiffs++;   }
909                                 else if(oligo[i] == 'Y' && (seq[i] != 'C' && seq[i] != 'T'))                                    {       countDiffs++;   }
910                                 else if(oligo[i] == 'M' && (seq[i] != 'C' && seq[i] != 'A'))                                    {       countDiffs++;   }
911                                 else if(oligo[i] == 'K' && (seq[i] != 'T' && seq[i] != 'G'))                                    {       countDiffs++;   }
912                                 else if(oligo[i] == 'W' && (seq[i] != 'T' && seq[i] != 'A'))                                    {       countDiffs++;   }
913                                 else if(oligo[i] == 'S' && (seq[i] != 'C' && seq[i] != 'G'))                                    {       countDiffs++;   }
914                                 else if(oligo[i] == 'B' && (seq[i] != 'C' && seq[i] != 'T' && seq[i] != 'G'))   {       countDiffs++;   }
915                                 else if(oligo[i] == 'D' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'G'))   {       countDiffs++;   }
916                                 else if(oligo[i] == 'H' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'C'))   {       countDiffs++;   }
917                                 else if(oligo[i] == 'V' && (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G'))   {       countDiffs++;   }       
918                         }
919                         
920                 }
921                 
922                 return countDiffs;
923         }
924         catch(exception& e) {
925                 m->errorOut(e, "TrimFlowsCommand", "countDiffs");
926                 exit(1);
927         }
928         
929 }
930
931 /**************************************************************************************************/
932
933 vector<unsigned long int> TrimFlowsCommand::getFlowFileBreaks() {
934
935         try{
936                         
937                 vector<unsigned long int> filePos;
938                 filePos.push_back(0);
939                                         
940                 FILE * pFile;
941                 unsigned long int size;
942                 
943                 //get num bytes in file
944                 pFile = fopen (flowFileName.c_str(),"rb");
945                 if (pFile==NULL) perror ("Error opening file");
946                 else{
947                         fseek (pFile, 0, SEEK_END);
948                         size=ftell (pFile);
949                         fclose (pFile);
950                 }
951                                 
952                 //estimate file breaks
953                 unsigned long int chunkSize = 0;
954                 chunkSize = size / processors;
955
956                 //file too small to divide by processors
957                 if (chunkSize == 0)  {  processors = 1; filePos.push_back(size); return filePos;        }
958                 
959                 //for each process seekg to closest file break and search for next '>' char. make that the filebreak
960                 for (int i = 0; i < processors; i++) {
961                         unsigned long int spot = (i+1) * chunkSize;
962                         
963                         ifstream in;
964                         m->openInputFile(flowFileName, in);
965                         in.seekg(spot);
966                         
967                         string dummy = m->getline(in);
968                         
969                         //there was not another sequence before the end of the file
970                         unsigned long int sanityPos = in.tellg();
971                         
972 //                      if (sanityPos == -1) {  break;  }
973 //                      else {  filePos.push_back(newSpot);  }
974                         if (sanityPos == -1) {  break;  }
975                         else {  filePos.push_back(sanityPos);  }
976                         
977                         in.close();
978                 }
979                 
980                 //save end pos
981                 filePos.push_back(size);
982                 
983                 //sanity check filePos
984                 for (int i = 0; i < (filePos.size()-1); i++) {
985                         if (filePos[(i+1)] <= filePos[i]) {  filePos.erase(filePos.begin()+(i+1)); i--; }
986                 }
987
988                 ifstream in;
989                 m->openInputFile(flowFileName, in);
990                 in >> numFlows;
991                 m->gobble(in);
992                 unsigned long int spot = in.tellg();
993                 filePos[0] = spot;
994                 in.close();
995                 
996                 processors = (filePos.size() - 1);
997                 
998                 return filePos; 
999         }
1000         catch(exception& e) {
1001                 m->errorOut(e, "TrimSeqsCommand", "getFlowFileBreaks");
1002                 exit(1);
1003         }
1004 }
1005
1006 /**************************************************************************************************/
1007
1008 int TrimFlowsCommand::createProcessesCreateTrim(string flowFileName, string trimFlowFileName, string scrapFlowFileName, string fastaFileName, vector<vector<string> > barcodePrimerComboFileNames){
1009
1010         try {
1011 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
1012                 int process = 1;
1013                 int exitCommand = 1;
1014                 processIDS.clear();
1015                 
1016                 //loop through and create all the processes you want
1017                 while (process != processors) {
1018                         int pid = fork();
1019                         
1020                         if (pid > 0) {
1021                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
1022                                 process++;
1023                         }else if (pid == 0){
1024                                 
1025                                 vector<vector<string> > tempBarcodePrimerComboFileNames = barcodePrimerComboFileNames;
1026                                 if(allFiles){
1027                                         for(int i=0;i<tempBarcodePrimerComboFileNames.size();i++){
1028                                                 for(int j=0;j<tempBarcodePrimerComboFileNames[0].size();j++){
1029                                                         tempBarcodePrimerComboFileNames[i][j] += toString(getpid()) + ".temp";
1030                                                         ofstream temp;
1031                                                         m->openOutputFile(tempBarcodePrimerComboFileNames[i][j], temp);
1032                                                         temp.close();
1033                                                         
1034                                                 }
1035                                         }
1036                                 }
1037                                 driverCreateTrim(flowFileName,
1038                                                                  (trimFlowFileName + toString(getpid()) + ".temp"),
1039                                                                  (scrapFlowFileName + toString(getpid()) + ".temp"),
1040                                                                  (fastaFileName + toString(getpid()) + ".temp"),
1041                                                                  tempBarcodePrimerComboFileNames, lines[process]);
1042
1043                                 exit(0);
1044                         }else { 
1045                                 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine(); 
1046                                 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
1047                                 exit(0);
1048                         }
1049                 }
1050                 
1051                 //parent do my part
1052                 ofstream temp;
1053                 m->openOutputFile(trimFlowFileName, temp);
1054                 temp.close();
1055
1056                 m->openOutputFile(scrapFlowFileName, temp);
1057                 temp.close();
1058                 
1059                 if(fasta){
1060                         m->openOutputFile(fastaFileName, temp);
1061                         temp.close();
1062                 }
1063                 
1064                 driverCreateTrim(flowFileName, trimFlowFileName, scrapFlowFileName, fastaFileName, barcodePrimerComboFileNames, lines[0]);
1065
1066                 //force parent to wait until all the processes are done
1067                 for (int i=0;i<processIDS.size();i++) { 
1068                         int temp = processIDS[i];
1069                         wait(&temp);
1070                 }
1071                 
1072                 //append files
1073                 m->mothurOutEndLine();
1074                 for(int i=0;i<processIDS.size();i++){
1075                         
1076                         m->mothurOut("Appending files from process " + toString(processIDS[i])); m->mothurOutEndLine();
1077                         
1078                         m->appendFiles((trimFlowFileName + toString(processIDS[i]) + ".temp"), trimFlowFileName);
1079                         remove((trimFlowFileName + toString(processIDS[i]) + ".temp").c_str());
1080 //                      m->mothurOut("\tDone with trim.flow file"); m->mothurOutEndLine();
1081
1082                         m->appendFiles((scrapFlowFileName + toString(processIDS[i]) + ".temp"), scrapFlowFileName);
1083                         remove((scrapFlowFileName + toString(processIDS[i]) + ".temp").c_str());
1084 //                      m->mothurOut("\tDone with scrap.flow file"); m->mothurOutEndLine();
1085
1086                         if(fasta){
1087                                 m->appendFiles((fastaFileName + toString(processIDS[i]) + ".temp"), fastaFileName);
1088                                 remove((fastaFileName + toString(processIDS[i]) + ".temp").c_str());
1089 //                              m->mothurOut("\tDone with flow.fasta file"); m->mothurOutEndLine();
1090                         }
1091                         if(allFiles){                                           
1092                                 for (int j = 0; j < barcodePrimerComboFileNames.size(); j++) {
1093                                         for (int k = 0; k < barcodePrimerComboFileNames[0].size(); k++) {
1094                                                 m->appendFiles((barcodePrimerComboFileNames[j][k] + toString(processIDS[i]) + ".temp"), barcodePrimerComboFileNames[j][k]);
1095                                                 remove((barcodePrimerComboFileNames[j][k] + toString(processIDS[i]) + ".temp").c_str());
1096                                         }
1097                                 }
1098                         }
1099                 }
1100                 
1101                 return exitCommand;
1102 #endif          
1103         }
1104         catch(exception& e) {
1105                 m->errorOut(e, "TrimFlowsCommand", "createProcessesCreateTrim");
1106                 exit(1);
1107         }
1108 }
1109
1110 //***************************************************************************************************************