]> git.donarmstrong.com Git - mothur.git/blob - trimflowscommand.cpp
added citation function to commands
[mothur.git] / trimflowscommand.cpp
1 /*
2  *  trimflowscommand.cpp
3  *  Mothur
4  *
5  *  Created by Pat Schloss on 12/22/10.
6  *  Copyright 2010 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "trimflowscommand.h"
11 #include "needlemanoverlap.hpp"
12
13 //**********************************************************************************************************************
14 vector<string> TrimFlowsCommand::setParameters(){       
15         try {
16                 CommandParameter pflow("flow", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pflow);
17                 CommandParameter poligos("oligos", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(poligos);
18                 CommandParameter pmaxhomop("maxhomop", "Number", "", "9", "", "", "",false,false); parameters.push_back(pmaxhomop);
19                 CommandParameter pmaxflows("maxflows", "Number", "", "720", "", "", "",false,false); parameters.push_back(pmaxflows);
20                 CommandParameter pminflows("minflows", "Number", "", "360", "", "", "",false,false); parameters.push_back(pminflows);
21                 CommandParameter pminlength("minlength", "Number", "", "0", "", "", "",false,false); parameters.push_back(pminlength);
22                 CommandParameter pmaxlength("maxlength", "Number", "", "0", "", "", "",false,false); parameters.push_back(pmaxlength);
23                 CommandParameter ppdiffs("pdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(ppdiffs);
24                 CommandParameter pbdiffs("bdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(pbdiffs);
25                 CommandParameter ptdiffs("tdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(ptdiffs);
26                 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
27                 CommandParameter psignal("signal", "Number", "", "0.50", "", "", "",false,false); parameters.push_back(psignal);
28                 CommandParameter pnoise("noise", "Number", "", "0.70", "", "", "",false,false); parameters.push_back(pnoise);
29                 CommandParameter pallfiles("allfiles", "Boolean", "", "t", "", "", "",false,false); parameters.push_back(pallfiles);
30                 CommandParameter porder("order", "String", "", "", "", "", "",false,false); parameters.push_back(porder);
31                 CommandParameter pfasta("fasta", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pfasta);
32                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
33                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
34                 
35                 vector<string> myArray;
36                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
37                 return myArray;
38         }
39         catch(exception& e) {
40                 m->errorOut(e, "TrimFlowsCommand", "setParameters");
41                 exit(1);
42         }
43 }
44 //**********************************************************************************************************************
45 string TrimFlowsCommand::getHelpString(){       
46         try {
47                 string helpString = "";
48                 helpString += "The trim.flows command reads a flowgram file and creates .....\n";
49                 helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n";
50                 helpString += "For more details please check out the wiki http://www.mothur.org/wiki/Trim.flows.\n";
51                 return helpString;
52         }
53         catch(exception& e) {
54                 m->errorOut(e, "TrimFlowsCommand", "getHelpString");
55                 exit(1);
56         }
57 }
58
59 //**********************************************************************************************************************
60
61 TrimFlowsCommand::TrimFlowsCommand(){   
62         try {
63                 abort = true; calledHelp = true; 
64                 setParameters();
65                 vector<string> tempOutNames;
66                 outputTypes["flow"] = tempOutNames;
67                 outputTypes["fasta"] = tempOutNames;
68         }
69         catch(exception& e) {
70                 m->errorOut(e, "TrimFlowsCommand", "TrimFlowsCommand");
71                 exit(1);
72         }
73 }
74 //**********************************************************************************************************************
75
76 TrimFlowsCommand::TrimFlowsCommand(string option)  {
77         try {
78                 
79                 abort = false; calledHelp = false;   
80                 comboStarts = 0;
81                 
82                 //allow user to run help
83                 if(option == "help") { help(); abort = true; calledHelp = true; }
84                 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
85                 
86                 else {
87                                                 
88                         vector<string> myArray = setParameters();
89                         
90                         OptionParser parser(option);
91                         map<string,string> parameters = parser.getParameters();
92                         
93                         ValidParameters validParameter;
94                         map<string,string>::iterator it;
95                         
96                         //check to make sure all parameters are valid for command
97                         for (it = parameters.begin(); it != parameters.end(); it++) { 
98                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
99                         }
100                         
101                         //initialize outputTypes
102                         vector<string> tempOutNames;
103                         outputTypes["flow"] = tempOutNames;
104                         outputTypes["fasta"] = tempOutNames;
105                         
106                         //if the user changes the input directory command factory will send this info to us in the output parameter 
107                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
108                         if (inputDir == "not found"){   inputDir = "";          }
109                         else {
110                                 string path;
111                                 it = parameters.find("flow");
112                                 //user has given a template file
113                                 if(it != parameters.end()){ 
114                                         path = m->hasPath(it->second);
115                                         //if the user has not given a path then, add inputdir. else leave path alone.
116                                         if (path == "") {       parameters["flow"] = inputDir + it->second;             }
117                                 }
118                                 
119                                 it = parameters.find("oligos");
120                                 //user has given a template file
121                                 if(it != parameters.end()){ 
122                                         path = m->hasPath(it->second);
123                                         //if the user has not given a path then, add inputdir. else leave path alone.
124                                         if (path == "") {       parameters["oligos"] = inputDir + it->second;           }
125                                 }
126                                 
127                                 
128 //                              it = parameters.find("group");
129 //                              //user has given a template file
130 //                              if(it != parameters.end()){ 
131 //                                      path = m->hasPath(it->second);
132 //                                      //if the user has not given a path then, add inputdir. else leave path alone.
133 //                                      if (path == "") {       parameters["group"] = inputDir + it->second;            }
134 //                              }
135                         }
136                         
137                         
138                         //check for required parameters
139                         flowFileName = validParameter.validFile(parameters, "flow", true);
140                         if (flowFileName == "not found") { 
141                                 flowFileName = m->getFlowFile(); 
142                                 if (flowFileName != "") {  m->mothurOut("Using " + flowFileName + " as input file for the flow parameter."); m->mothurOutEndLine(); }
143                                 else { 
144                                         m->mothurOut("No valid current flow file. You must provide a flow file."); m->mothurOutEndLine(); 
145                                         abort = true;
146                                 } 
147                         }else if (flowFileName == "not open") { flowFileName = ""; abort = true; }      
148                         
149                         //if the user changes the output directory command factory will send this info to us in the output parameter 
150                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  
151                                 outputDir = ""; 
152                                 outputDir += m->hasPath(flowFileName); //if user entered a file with a path then preserve it    
153                         }
154                         
155                         
156                         //check for optional parameter and set defaults
157                         // ...at some point should added some additional type checking...
158                         
159                         string temp;
160                         temp = validParameter.validFile(parameters, "minflows", false); if (temp == "not found") { temp = "360"; }
161                         convert(temp, minFlows);  
162
163                         temp = validParameter.validFile(parameters, "maxflows", false); if (temp == "not found") { temp = "720"; }
164                         convert(temp, maxFlows);  
165                         
166                         
167                         temp = validParameter.validFile(parameters, "oligos", true);
168                         if (temp == "not found")        {       oligoFileName = "";             }
169                         else if(temp == "not open")     {       abort = true;                   } 
170                         else                                            {       oligoFileName = temp;   }
171                         
172                         temp = validParameter.validFile(parameters, "fasta", false);            if (temp == "not found"){       fasta = 0;              }
173                         else if(m->isTrue(temp))        {       fasta = 1;      }
174                         
175                         temp = validParameter.validFile(parameters, "maxhomop", false);         if (temp == "not found"){       temp = "9";             }
176                         convert(temp, maxHomoP);  
177
178                         temp = validParameter.validFile(parameters, "signal", false);           if (temp == "not found"){       temp = "0.50";  }
179                         convert(temp, signal);  
180
181                         temp = validParameter.validFile(parameters, "noise", false);            if (temp == "not found"){       temp = "0.70";  }
182                         convert(temp, noise);  
183
184                         temp = validParameter.validFile(parameters, "minlength", false);        if (temp == "not found"){       temp = "0";             }
185                         convert(temp, minLength); 
186                         
187                         temp = validParameter.validFile(parameters, "maxlength", false);        if (temp == "not found"){       temp = "0";             }
188                         convert(temp, maxLength);
189                         
190                         temp = validParameter.validFile(parameters, "bdiffs", false);           if (temp == "not found"){       temp = "0";             }
191                         convert(temp, bdiffs);
192                         
193                         temp = validParameter.validFile(parameters, "pdiffs", false);           if (temp == "not found"){       temp = "0";             }
194                         convert(temp, pdiffs);
195                         
196                         temp = validParameter.validFile(parameters, "tdiffs", false);
197                         if (temp == "not found"){ int tempTotal = pdiffs + bdiffs;  temp = toString(tempTotal); }
198                         convert(temp, tdiffs);
199                         if(tdiffs == 0){        tdiffs = bdiffs + pdiffs;       }
200                         
201                         temp = validParameter.validFile(parameters, "allfiles", false);         if (temp == "not found"){ temp = "T";           }
202                         allFiles = m->isTrue(temp);
203                         
204                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found"){       temp = m->getProcessors();      }
205                         m->setProcessors(temp);
206                         convert(temp, processors);
207         
208                         flowOrder = validParameter.validFile(parameters, "order", false);
209                         if (flowOrder == "not found"){ flowOrder = "TACG";              }
210                         else if(flowOrder.length() != 4){
211                                 m->mothurOut("The value of the order option must be four bases long\n");
212                         }
213
214                         if(oligoFileName == ""){        allFiles = 0;           }
215
216                         numFPrimers = 0;
217                         numRPrimers = 0;
218                 }
219                 
220         }
221         catch(exception& e) {
222                 m->errorOut(e, "TrimFlowsCommand", "TrimSeqsCommand");
223                 exit(1);
224         }
225 }
226
227 //***************************************************************************************************************
228
229 int TrimFlowsCommand::execute(){
230         try{
231                 
232                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
233
234                 string trimFlowFileName = outputDir + m->getRootName(m->getSimpleName(flowFileName)) + "trim.flow";
235                 outputNames.push_back(trimFlowFileName); outputTypes["flow"].push_back(trimFlowFileName);
236                 
237                 string scrapFlowFileName = outputDir + m->getRootName(m->getSimpleName(flowFileName)) + "scrap.flow";
238                 outputNames.push_back(scrapFlowFileName); outputTypes["flow"].push_back(scrapFlowFileName);
239
240                 string fastaFileName = outputDir + m->getRootName(m->getSimpleName(flowFileName)) + "flow.fasta";
241                 if(fasta){
242                         outputNames.push_back(fastaFileName); outputTypes["fasta"].push_back(fastaFileName);
243                 }
244                 
245                 vector<unsigned long int> flowFilePos = getFlowFileBreaks();
246                 for (int i = 0; i < (flowFilePos.size()-1); i++) {
247                         lines.push_back(new linePair(flowFilePos[i], flowFilePos[(i+1)]));
248                 }       
249
250                 vector<vector<string> > barcodePrimerComboFileNames;
251                 if(oligoFileName != ""){
252                         getOligos(barcodePrimerComboFileNames); 
253                 }
254                 
255 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
256                 if(processors == 1){
257                         driverCreateTrim(flowFileName, trimFlowFileName, scrapFlowFileName, fastaFileName, barcodePrimerComboFileNames, lines[0]);
258                 }else{
259                         createProcessesCreateTrim(flowFileName, trimFlowFileName, scrapFlowFileName, fastaFileName, barcodePrimerComboFileNames); 
260                 }       
261 #else
262                 driverCreateTrim(flowFileName, trimFlowFileName, scrapFlowFileName, fastaFileName, barcodePrimerComboFileNames, lines[0]);
263 #endif
264                 
265                 if (m->control_pressed) {  return 0; }                  
266                 
267                 string flowFilesFileName;
268                 ofstream output;
269                 
270                 if(allFiles){
271                         
272                         flowFilesFileName = outputDir + m->getRootName(m->getSimpleName(flowFileName)) + "flow.files";
273                         m->openOutputFile(flowFilesFileName, output);
274
275                         for(int i=0;i<barcodePrimerComboFileNames.size();i++){
276                                 for(int j=0;j<barcodePrimerComboFileNames[0].size();j++){
277                                         
278                                         FILE * pFile;
279                                         unsigned long int size;
280                                         
281                                         //get num bytes in file
282                                         pFile = fopen (barcodePrimerComboFileNames[i][j].c_str(),"rb");
283                                         if (pFile==NULL) perror ("Error opening file");
284                                         else{
285                                                 fseek (pFile, 0, SEEK_END);
286                                                 size=ftell (pFile);
287                                                 fclose (pFile);
288                                         }
289
290                                         if(size < 10){
291                                                 remove(barcodePrimerComboFileNames[i][j].c_str());
292                                         }
293                                         else{
294                                                 output << barcodePrimerComboFileNames[i][j] << endl;
295                                         }
296                                 }
297                         }
298                         output.close();
299                 }
300                 else{
301                         flowFilesFileName = outputDir + m->getRootName(m->getSimpleName(flowFileName)) + "flow.files";
302                         m->openOutputFile(flowFilesFileName, output);
303                         
304                         output << trimFlowFileName << endl;
305                         
306                         output.close();
307                 }
308                 outputTypes["flow.files"].push_back(flowFilesFileName);
309                 outputNames.push_back(flowFileName);
310                 
311                 //set fasta file as new current fastafile
312                 string current = "";
313                 itTypes = outputTypes.find("fasta");
314                 if (itTypes != outputTypes.end()) {
315                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFastaFile(current); }
316                 }
317                 
318                 m->mothurOutEndLine();
319                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
320                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
321                 m->mothurOutEndLine();
322                 
323                 return 0;       
324         }
325         catch(exception& e) {
326                 m->errorOut(e, "TrimSeqsCommand", "execute");
327                 exit(1);
328         }
329 }
330
331 //***************************************************************************************************************
332
333 int TrimFlowsCommand::driverCreateTrim(string flowFileName, string trimFlowFileName, string scrapFlowFileName, string fastaFileName, vector<vector<string> > barcodePrimerComboFileNames, linePair* line){
334         
335         try {
336                 
337                 ofstream trimFlowFile;
338                 m->openOutputFile(trimFlowFileName, trimFlowFile);
339                 trimFlowFile.setf(ios::fixed, ios::floatfield); trimFlowFile.setf(ios::showpoint);
340
341                 ofstream scrapFlowFile;
342                 m->openOutputFile(scrapFlowFileName, scrapFlowFile);
343                 scrapFlowFile.setf(ios::fixed, ios::floatfield); scrapFlowFile.setf(ios::showpoint);
344
345                 if(line->start == 4){
346                         scrapFlowFile << maxFlows << endl;
347                         trimFlowFile << maxFlows << endl;
348                         if(allFiles){
349                                 for(int i=0;i<barcodePrimerComboFileNames.size();i++){
350                                         for(int j=0;j<barcodePrimerComboFileNames[0].size();j++){
351                                                 //                              barcodePrimerComboFileNames[i][j] += toString(getpid()) + ".temp";
352                                                 ofstream temp;
353                                                 m->openOutputFile(barcodePrimerComboFileNames[i][j], temp);
354                                                 temp << maxFlows << endl;
355                                                 temp.close();
356                                         }
357                                 }                       
358                         }
359                 }
360                 
361                 FlowData flowData(numFlows, signal, noise, maxHomoP, flowOrder);
362                 
363                 ofstream fastaFile;
364                 if(fasta){      m->openOutputFile(fastaFileName, fastaFile);    }
365                 
366                 ifstream flowFile;
367                 m->openInputFile(flowFileName, flowFile);
368                 
369                 flowFile.seekg(line->start);
370
371                 int count = 0;
372                 bool moreSeqs = 1;
373                         
374                 while(moreSeqs) {
375                         
376                         int success = 1;
377                         int currentSeqDiffs = 0;
378                         string trashCode = "";
379                         
380                         flowData.getNext(flowFile);
381                         flowData.capFlows(maxFlows);    
382                         
383                         Sequence currSeq = flowData.getSequence();
384                         if(!flowData.hasMinFlows(minFlows)){    //screen to see if sequence is of a minimum number of flows
385                                 success = 0;
386                                 trashCode += 'l';
387                         }
388                         
389                         if(minLength > 0 || maxLength > 0){     //screen to see if sequence is above and below a specific number of bases
390                                 int seqLength = currSeq.getNumBases();
391                                 if(seqLength < minLength || seqLength > maxLength){
392                                         success = 0;
393                                         trashCode += 'l';
394                                 }
395                         }
396                         
397                         int primerIndex = 0;
398                         int barcodeIndex = 0;
399                         
400                         if(barcodes.size() != 0){
401                                 success = stripBarcode(currSeq, barcodeIndex);
402                                 if(success > bdiffs)            {       trashCode += 'b';       }
403                                 else{ currentSeqDiffs += success;  }
404                         }
405                         
406                         if(numFPrimers != 0){
407                                 success = stripForward(currSeq, primerIndex);
408                                 if(success > pdiffs)            {       trashCode += 'f';       }
409                                 else{ currentSeqDiffs += success;  }
410                         }
411                         
412                         if (currentSeqDiffs > tdiffs)   {       trashCode += 't';   }
413                         
414                         if(numRPrimers != 0){
415                                 success = stripReverse(currSeq);
416                                 if(!success)                            {       trashCode += 'r';       }
417                         }
418
419                         if(trashCode.length() == 0){
420                                                         
421                                 flowData.printFlows(trimFlowFile);
422                         
423                                 if(fasta)       {       currSeq.printSequence(fastaFile);       }
424                                 
425                                 if(allFiles){
426                                         ofstream output;
427                                         m->openOutputFileAppend(barcodePrimerComboFileNames[barcodeIndex][primerIndex], output);
428                                         output.setf(ios::fixed, ios::floatfield); trimFlowFile.setf(ios::showpoint);
429                                         
430                                         flowData.printFlows(output);
431                                         output.close();
432                                 }                               
433                         }
434                         else{
435                                 flowData.printFlows(scrapFlowFile, trashCode);
436                         }
437                                 
438                         count++;
439                                                 
440                         //report progress
441                         if((count) % 10000 == 0){       m->mothurOut(toString(count)); m->mothurOutEndLine();           }
442
443 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
444                         unsigned long int pos = flowFile.tellg();
445
446                         if ((pos == -1) || (pos >= line->end)) { break; }
447 #else
448                         if (flowFile.eof()) { break; }
449 #endif
450                         
451                 }
452                 //report progress
453                 if((count) % 10000 != 0){       m->mothurOut(toString(count)); m->mothurOutEndLine();           }
454                 
455                 trimFlowFile.close();
456                 scrapFlowFile.close();
457                 flowFile.close();
458                 if(fasta){      fastaFile.close();      }
459                 
460                 return count;
461         }
462         catch(exception& e) {
463                 m->errorOut(e, "TrimSeqsCommand", "driverCreateTrim");
464                 exit(1);
465         }
466 }
467
468 //***************************************************************************************************************
469
470 void TrimFlowsCommand::getOligos(vector<vector<string> >& outFlowFileNames){
471         try {
472                 ifstream oligosFile;
473                 m->openInputFile(oligoFileName, oligosFile);
474                 
475                 string type, oligo, group;
476
477                 int indexPrimer = 0;
478                 int indexBarcode = 0;
479                 
480                 while(!oligosFile.eof()){
481                 
482                         oligosFile >> type; m->gobble(oligosFile);      //get the first column value of the row - is it a comment or a feature we are interested in?
483
484                         if(type[0] == '#'){     //igore the line because there's a comment
485                                 while (!oligosFile.eof())       {       char c = oligosFile.get(); if (c == 10 || c == 13){     break;  }       } // get rest of line if there's any crap there
486                         }
487                         else{                           //there's a feature we're interested in
488
489                                 for(int i=0;i<type.length();i++){       type[i] = toupper(type[i]);  }                                  //make type case insensitive
490
491                                 oligosFile >> oligo;    //get the DNA sequence for the feature
492
493                                 for(int i=0;i<oligo.length();i++){      //make type case insensitive and change any U's to T's
494                                         oligo[i] = toupper(oligo[i]);
495                                         if(oligo[i] == 'U')     {       oligo[i] = 'T'; }
496                                 }
497
498                                 if(type == "FORWARD"){  //if the feature is a forward primer...
499                                         group = "";
500
501                                         while (!oligosFile.eof())       {       // get rest of line in case there is a primer name = will have the name of the primer
502                                                 char c = oligosFile.get(); 
503                                                 if (c == 10 || c == 13){        break;  }
504                                                 else if (c == 32 || c == 9){;} //space or tab
505                                                 else {  group += c;  }
506                                         } 
507
508                                         //have we seen this primer already?
509                                         map<string, int>::iterator itPrimer = primers.find(oligo);
510                                         if (itPrimer != primers.end()) { m->mothurOut("primer " + oligo + " is in your oligos file already."); m->mothurOutEndLine();  }
511
512                                         primers[oligo]=indexPrimer; indexPrimer++;
513                                         primerNameVector.push_back(group);
514
515                                 }
516                                 else if(type == "REVERSE"){
517                                         Sequence oligoRC("reverse", oligo);
518                                         oligoRC.reverseComplement();
519                                         revPrimer.push_back(oligoRC.getUnaligned());
520                                 }
521                                 else if(type == "BARCODE"){
522                                         oligosFile >> group;
523
524                                         //check for repeat barcodes
525                                         map<string, int>::iterator itBar = barcodes.find(oligo);
526                                         if (itBar != barcodes.end()) { m->mothurOut("barcode " + oligo + " is in your oligos file already."); m->mothurOutEndLine();  }
527
528                                         barcodes[oligo]=indexBarcode; indexBarcode++;
529                                         barcodeNameVector.push_back(group);
530                                 }
531                                 else{
532                                         m->mothurOut(type + " is not recognized as a valid type. Choices are forward, reverse, and barcode. Ignoring " + oligo + "."); m->mothurOutEndLine();  
533                                 }
534                         }
535
536                         m->gobble(oligosFile);
537                 }
538                 oligosFile.close();
539                 
540                 if(barcodeNameVector.size() == 0 && primerNameVector[0] == ""){ allFiles = 0;   }
541                 
542                 //add in potential combos
543                 if(barcodeNameVector.size() == 0){
544                         barcodes[""] = 0;
545                         barcodeNameVector.push_back("");                        
546                 }
547                 
548                 if(primerNameVector.size() == 0){
549                         primers[""] = 0;
550                         primerNameVector.push_back("");                 
551                 }
552                 
553                 
554                 outFlowFileNames.resize(barcodeNameVector.size());
555                 for(int i=0;i<outFlowFileNames.size();i++){
556                         outFlowFileNames[i].assign(primerNameVector.size(), "");
557                 }
558                 
559                 if(allFiles){
560
561                         for(map<string, int>::iterator itBar = barcodes.begin();itBar != barcodes.end();itBar++){
562                                 for(map<string, int>::iterator itPrimer = primers.begin();itPrimer != primers.end(); itPrimer++){
563
564                                         string primerName = primerNameVector[itPrimer->second];
565                                         string barcodeName = barcodeNameVector[itBar->second];
566                                                                                 
567                                         string comboGroupName = "";
568                                         string fileName = "";
569                                         
570                                         if(primerName == ""){
571                                                 comboGroupName = barcodeNameVector[itBar->second];
572                                                 fileName = outputDir + m->getRootName(m->getSimpleName(flowFileName)) + comboGroupName + ".flow";
573                                         }
574                                         else{
575                                                 if(barcodeName == ""){
576                                                         comboGroupName = primerNameVector[itPrimer->second];
577                                                 }
578                                                 else{
579                                                         comboGroupName = barcodeNameVector[itBar->second] + "." + primerNameVector[itPrimer->second];
580                                                 }
581                                                 fileName = outputDir + m->getRootName(m->getSimpleName(flowFileName)) + comboGroupName + ".flow";
582                                         }
583                                         
584                                         outputNames.push_back(fileName);
585                                         outputTypes["flow"].push_back(fileName);
586                                         outFlowFileNames[itBar->second][itPrimer->second] = fileName;
587                                         
588                                         ofstream temp;
589                                         m->openOutputFile(fileName, temp);
590                                         temp.close();
591                                 }
592                         }
593                 }
594                 
595                 numFPrimers = primers.size();
596                 numRPrimers = revPrimer.size();
597                 
598         }
599         catch(exception& e) {
600                 m->errorOut(e, "TrimSeqsCommand", "getOligos");
601                 exit(1);
602         }
603 }
604
605 //***************************************************************************************************************
606
607 int TrimFlowsCommand::stripBarcode(Sequence& seq, int& group){
608         try {
609                 
610                 string rawSequence = seq.getUnaligned();
611                 int success = bdiffs + 1;       //guilty until proven innocent
612                 
613                 //can you find the barcode
614                 for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
615                         string oligo = it->first;
616                         if(rawSequence.length() < oligo.length()){      //let's just assume that the barcodes are the same length
617                                 success = bdiffs + 10;                                  //if the sequence is shorter than the barcode then bail out
618                                 break;  
619                         }
620                         
621                         if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
622                                 group = it->second;
623                                 seq.setUnaligned(rawSequence.substr(oligo.length()));
624                                 
625                                 success = 0;
626                                 break;
627                         }
628                 }
629                 
630                 //if you found the barcode or if you don't want to allow for diffs
631                 if ((bdiffs == 0) || (success == 0)) { return success;  }
632                 
633                 else { //try aligning and see if you can find it
634                         
635                         int maxLength = 0;
636                         
637                         Alignment* alignment;
638                         if (barcodes.size() > 0) {
639                                 map<string,int>::iterator it=barcodes.begin();
640                                 
641                                 for(it;it!=barcodes.end();it++){
642                                         if(it->first.length() > maxLength){
643                                                 maxLength = it->first.length();
644                                         }
645                                 }
646                                 alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+bdiffs+1));  
647                                 
648                         }else{ alignment = NULL; } 
649                         
650                         //can you find the barcode
651                         int minDiff = 1e6;
652                         int minCount = 1;
653                         int minGroup = -1;
654                         int minPos = 0;
655                         
656                         for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
657                                 string oligo = it->first;
658                                 //                              int length = oligo.length();
659                                 
660                                 if(rawSequence.length() < maxLength){   //let's just assume that the barcodes are the same length
661                                         success = bdiffs + 10;
662                                         break;
663                                 }
664                                 
665                                 //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
666                                 alignment->align(oligo, rawSequence.substr(0,oligo.length()+bdiffs));
667                                 oligo = alignment->getSeqAAln();
668                                 string temp = alignment->getSeqBAln();
669                                 
670                                 int alnLength = oligo.length();
671                                 
672                                 for(int i=oligo.length()-1;i>=0;i--){
673                                         if(oligo[i] != '-'){    alnLength = i+1;        break;  }
674                                 }
675                                 oligo = oligo.substr(0,alnLength);
676                                 temp = temp.substr(0,alnLength);
677                                 
678                                 int numDiff = countDiffs(oligo, temp);
679                                 
680                                 if(numDiff < minDiff){
681                                         minDiff = numDiff;
682                                         minCount = 1;
683                                         minGroup = it->second;
684                                         minPos = 0;
685                                         for(int i=0;i<alnLength;i++){
686                                                 if(temp[i] != '-'){
687                                                         minPos++;
688                                                 }
689                                         }
690                                 }
691                                 else if(numDiff == minDiff){
692                                         minCount++;
693                                 }
694                                 
695                         }
696                         
697                         if(minDiff > bdiffs)    {       success = minDiff;              }       //no good matches
698                         else if(minCount > 1)   {       success = bdiffs + 100; }       //can't tell the difference between multiple barcodes
699                         else{                                                                                                   //use the best match
700                                 group = minGroup;
701                                 seq.setUnaligned(rawSequence.substr(minPos));
702                                 success = minDiff;
703                         }
704                         
705                         if (alignment != NULL) {  delete alignment;  }
706                         
707                 }
708                 
709                 return success;
710                 
711         }
712         catch(exception& e) {
713                 m->errorOut(e, "TrimFlowsCommand", "stripBarcode");
714                 exit(1);
715         }
716         
717 }
718
719 //***************************************************************************************************************
720
721 int TrimFlowsCommand::stripForward(Sequence& seq, int& group){
722         try {
723                 
724                 string rawSequence = seq.getUnaligned();
725                 int success = pdiffs + 1;       //guilty until proven innocent
726
727                 //can you find the primer
728                 for(map<string,int>::iterator it=primers.begin();it!=primers.end();it++){
729                         string oligo = it->first;
730                         if(rawSequence.length() < oligo.length()){      //let's just assume that the primers are the same length
731                                 success = pdiffs + 10;                                  //if the sequence is shorter than the barcode then bail out
732                                 break;  
733                         }
734                         
735                         if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
736                                 group = it->second;
737                                 seq.setUnaligned(rawSequence.substr(oligo.length()));
738                                 success = 0;
739                                 break;
740                         }
741                 }
742                 
743                 //if you found the barcode or if you don't want to allow for diffs
744                 if ((pdiffs == 0) || (success == 0)) {  return success;  }
745                 
746                 else { //try aligning and see if you can find it
747                         
748                         int maxLength = 0;
749                         
750                         Alignment* alignment;
751                         if (primers.size() > 0) {
752                                 map<string,int>::iterator it=primers.begin();
753                                 
754                                 for(it;it!=primers.end();it++){
755                                         if(it->first.length() > maxLength){
756                                                 maxLength = it->first.length();
757                                         }
758                                 }
759                                 alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+pdiffs+1));  
760                                 
761                         }else{ alignment = NULL; } 
762                         
763                         //can you find the barcode
764                         int minDiff = 1e6;
765                         int minCount = 1;
766                         int minGroup = -1;
767                         int minPos = 0;
768                         
769                         for(map<string,int>::iterator it=primers.begin();it!=primers.end();it++){
770                                 string oligo = it->first;
771                                 //                              int length = oligo.length();
772                                 
773                                 if(rawSequence.length() < maxLength){   
774                                         success = pdiffs + 100;
775                                         break;
776                                 }
777                                 
778                                 //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
779                                 alignment->align(oligo, rawSequence.substr(0,oligo.length()+pdiffs));
780                                 oligo = alignment->getSeqAAln();
781                                 string temp = alignment->getSeqBAln();
782                                 
783                                 int alnLength = oligo.length();
784                                 
785                                 for(int i=oligo.length()-1;i>=0;i--){
786                                         if(oligo[i] != '-'){    alnLength = i+1;        break;  }
787                                 }
788                                 oligo = oligo.substr(0,alnLength);
789                                 temp = temp.substr(0,alnLength);
790                                 
791                                 int numDiff = countDiffs(oligo, temp);
792                                 
793                                 if(numDiff < minDiff){
794                                         minDiff = numDiff;
795                                         minCount = 1;
796                                         minGroup = it->second;
797                                         minPos = 0;
798                                         for(int i=0;i<alnLength;i++){
799                                                 if(temp[i] != '-'){
800                                                         minPos++;
801                                                 }
802                                         }
803                                 }
804                                 else if(numDiff == minDiff){
805                                         minCount++;
806                                 }
807                                 
808                         }
809                         
810                         if(minDiff > pdiffs)    {       success = minDiff;              }       //no good matches
811                         else if(minCount > 1)   {       success = pdiffs + 10;  }       //can't tell the difference between multiple primers
812                         else{                                                                                                   //use the best match
813                                 group = minGroup;
814                                 seq.setUnaligned(rawSequence.substr(minPos));
815                                 success = minDiff;
816                         }
817                         
818                         if (alignment != NULL) {  delete alignment;  }
819                         
820                 }
821                 
822                 return success;
823                 
824         }
825         catch(exception& e) {
826                 m->errorOut(e, "TrimFlowsCommand", "stripForward");
827                 exit(1);
828         }
829 }
830
831 //***************************************************************************************************************
832
833 bool TrimFlowsCommand::stripReverse(Sequence& seq){
834         try {
835
836                 string rawSequence = seq.getUnaligned();
837                 bool success = 0;       //guilty until proven innocent
838                 
839                 for(int i=0;i<numRPrimers;i++){
840                         string oligo = revPrimer[i];
841                         
842                         if(rawSequence.length() < oligo.length()){
843                                 success = 0;
844                                 break;
845                         }
846                         
847                         if(compareDNASeq(oligo, rawSequence.substr(rawSequence.length()-oligo.length(),oligo.length()))){
848                                 seq.setUnaligned(rawSequence.substr(0,rawSequence.length()-oligo.length()));
849                                 success = 1;
850                                 break;
851                         }
852                 }       
853
854                 return success;
855                 
856         }
857         catch(exception& e) {
858                 m->errorOut(e, "TrimFlowsCommand", "stripReverse");
859                 exit(1);
860         }
861 }
862
863
864 //***************************************************************************************************************
865
866 bool TrimFlowsCommand::compareDNASeq(string oligo, string seq){
867         try {
868                 bool success = 1;
869                 int length = oligo.length();
870                 
871                 for(int i=0;i<length;i++){
872                         
873                         if(oligo[i] != seq[i]){
874                                 if(oligo[i] == 'A' || oligo[i] == 'T' || oligo[i] == 'G' || oligo[i] == 'C')    {       success = 0;    }
875                                 else if((oligo[i] == 'N' || oligo[i] == 'I') && (seq[i] == 'N'))                                {       success = 0;    }
876                                 else if(oligo[i] == 'R' && (seq[i] != 'A' && seq[i] != 'G'))                                    {       success = 0;    }
877                                 else if(oligo[i] == 'Y' && (seq[i] != 'C' && seq[i] != 'T'))                                    {       success = 0;    }
878                                 else if(oligo[i] == 'M' && (seq[i] != 'C' && seq[i] != 'A'))                                    {       success = 0;    }
879                                 else if(oligo[i] == 'K' && (seq[i] != 'T' && seq[i] != 'G'))                                    {       success = 0;    }
880                                 else if(oligo[i] == 'W' && (seq[i] != 'T' && seq[i] != 'A'))                                    {       success = 0;    }
881                                 else if(oligo[i] == 'S' && (seq[i] != 'C' && seq[i] != 'G'))                                    {       success = 0;    }
882                                 else if(oligo[i] == 'B' && (seq[i] != 'C' && seq[i] != 'T' && seq[i] != 'G'))   {       success = 0;    }
883                                 else if(oligo[i] == 'D' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'G'))   {       success = 0;    }
884                                 else if(oligo[i] == 'H' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'C'))   {       success = 0;    }
885                                 else if(oligo[i] == 'V' && (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G'))   {       success = 0;    }                       
886                                 
887                                 if(success == 0)        {       break;   }
888                         }
889                         else{
890                                 success = 1;
891                         }
892                 }
893                 
894                 return success;
895         }
896         catch(exception& e) {
897                 m->errorOut(e, "TrimFlowsCommand", "compareDNASeq");
898                 exit(1);
899         }
900         
901 }
902
903 //***************************************************************************************************************
904
905 int TrimFlowsCommand::countDiffs(string oligo, string seq){
906         try {
907                 
908                 int length = oligo.length();
909                 int countDiffs = 0;
910                 
911                 for(int i=0;i<length;i++){
912                         
913                         if(oligo[i] != seq[i]){
914                                 if(oligo[i] == 'A' || oligo[i] == 'T' || oligo[i] == 'G' || oligo[i] == 'C' || oligo[i] == '-' || oligo[i] == '.')      {       countDiffs++;   }
915                                 else if((oligo[i] == 'N' || oligo[i] == 'I') && (seq[i] == 'N'))                                {       countDiffs++;   }
916                                 else if(oligo[i] == 'R' && (seq[i] != 'A' && seq[i] != 'G'))                                    {       countDiffs++;   }
917                                 else if(oligo[i] == 'Y' && (seq[i] != 'C' && seq[i] != 'T'))                                    {       countDiffs++;   }
918                                 else if(oligo[i] == 'M' && (seq[i] != 'C' && seq[i] != 'A'))                                    {       countDiffs++;   }
919                                 else if(oligo[i] == 'K' && (seq[i] != 'T' && seq[i] != 'G'))                                    {       countDiffs++;   }
920                                 else if(oligo[i] == 'W' && (seq[i] != 'T' && seq[i] != 'A'))                                    {       countDiffs++;   }
921                                 else if(oligo[i] == 'S' && (seq[i] != 'C' && seq[i] != 'G'))                                    {       countDiffs++;   }
922                                 else if(oligo[i] == 'B' && (seq[i] != 'C' && seq[i] != 'T' && seq[i] != 'G'))   {       countDiffs++;   }
923                                 else if(oligo[i] == 'D' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'G'))   {       countDiffs++;   }
924                                 else if(oligo[i] == 'H' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'C'))   {       countDiffs++;   }
925                                 else if(oligo[i] == 'V' && (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G'))   {       countDiffs++;   }       
926                         }
927                         
928                 }
929                 
930                 return countDiffs;
931         }
932         catch(exception& e) {
933                 m->errorOut(e, "TrimFlowsCommand", "countDiffs");
934                 exit(1);
935         }
936         
937 }
938
939 /**************************************************************************************************/
940
941 vector<unsigned long int> TrimFlowsCommand::getFlowFileBreaks() {
942
943         try{
944                         
945                 vector<unsigned long int> filePos;
946                 filePos.push_back(0);
947                                         
948                 FILE * pFile;
949                 unsigned long int size;
950                 
951                 //get num bytes in file
952                 pFile = fopen (flowFileName.c_str(),"rb");
953                 if (pFile==NULL) perror ("Error opening file");
954                 else{
955                         fseek (pFile, 0, SEEK_END);
956                         size=ftell (pFile);
957                         fclose (pFile);
958                 }
959                                 
960                 //estimate file breaks
961                 unsigned long int chunkSize = 0;
962                 chunkSize = size / processors;
963
964                 //file too small to divide by processors
965                 if (chunkSize == 0)  {  processors = 1; filePos.push_back(size); return filePos;        }
966                 
967                 //for each process seekg to closest file break and search for next '>' char. make that the filebreak
968                 for (int i = 0; i < processors; i++) {
969                         unsigned long int spot = (i+1) * chunkSize;
970                         
971                         ifstream in;
972                         m->openInputFile(flowFileName, in);
973                         in.seekg(spot);
974                         
975                         string dummy = m->getline(in);
976                         
977                         //there was not another sequence before the end of the file
978                         unsigned long int sanityPos = in.tellg();
979                         
980 //                      if (sanityPos == -1) {  break;  }
981 //                      else {  filePos.push_back(newSpot);  }
982                         if (sanityPos == -1) {  break;  }
983                         else {  filePos.push_back(sanityPos);  }
984                         
985                         in.close();
986                 }
987                 
988                 //save end pos
989                 filePos.push_back(size);
990                 
991                 //sanity check filePos
992                 for (int i = 0; i < (filePos.size()-1); i++) {
993                         if (filePos[(i+1)] <= filePos[i]) {  filePos.erase(filePos.begin()+(i+1)); i--; }
994                 }
995
996                 ifstream in;
997                 m->openInputFile(flowFileName, in);
998                 in >> numFlows;
999                 m->gobble(in);
1000                 unsigned long int spot = in.tellg();
1001                 filePos[0] = spot;
1002                 in.close();
1003                 
1004                 processors = (filePos.size() - 1);
1005                 
1006                 return filePos; 
1007         }
1008         catch(exception& e) {
1009                 m->errorOut(e, "TrimSeqsCommand", "getFlowFileBreaks");
1010                 exit(1);
1011         }
1012 }
1013
1014 /**************************************************************************************************/
1015
1016 int TrimFlowsCommand::createProcessesCreateTrim(string flowFileName, string trimFlowFileName, string scrapFlowFileName, string fastaFileName, vector<vector<string> > barcodePrimerComboFileNames){
1017
1018         try {
1019 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
1020                 int process = 1;
1021                 int exitCommand = 1;
1022                 processIDS.clear();
1023                 
1024                 //loop through and create all the processes you want
1025                 while (process != processors) {
1026                         int pid = fork();
1027                         
1028                         if (pid > 0) {
1029                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
1030                                 process++;
1031                         }else if (pid == 0){
1032                                 
1033                                 vector<vector<string> > tempBarcodePrimerComboFileNames = barcodePrimerComboFileNames;
1034                                 if(allFiles){
1035                                         for(int i=0;i<tempBarcodePrimerComboFileNames.size();i++){
1036                                                 for(int j=0;j<tempBarcodePrimerComboFileNames[0].size();j++){
1037                                                         tempBarcodePrimerComboFileNames[i][j] += toString(getpid()) + ".temp";
1038                                                         ofstream temp;
1039                                                         m->openOutputFile(tempBarcodePrimerComboFileNames[i][j], temp);
1040                                                         temp.close();
1041                                                         
1042                                                 }
1043                                         }
1044                                 }
1045                                 driverCreateTrim(flowFileName,
1046                                                                  (trimFlowFileName + toString(getpid()) + ".temp"),
1047                                                                  (scrapFlowFileName + toString(getpid()) + ".temp"),
1048                                                                  (fastaFileName + toString(getpid()) + ".temp"),
1049                                                                  tempBarcodePrimerComboFileNames, lines[process]);
1050
1051                                 exit(0);
1052                         }else { 
1053                                 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine(); 
1054                                 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
1055                                 exit(0);
1056                         }
1057                 }
1058                 
1059                 //parent do my part
1060                 ofstream temp;
1061                 m->openOutputFile(trimFlowFileName, temp);
1062                 temp.close();
1063
1064                 m->openOutputFile(scrapFlowFileName, temp);
1065                 temp.close();
1066                 
1067                 if(fasta){
1068                         m->openOutputFile(fastaFileName, temp);
1069                         temp.close();
1070                 }
1071                 
1072                 driverCreateTrim(flowFileName, trimFlowFileName, scrapFlowFileName, fastaFileName, barcodePrimerComboFileNames, lines[0]);
1073
1074                 //force parent to wait until all the processes are done
1075                 for (int i=0;i<processIDS.size();i++) { 
1076                         int temp = processIDS[i];
1077                         wait(&temp);
1078                 }
1079                 
1080                 //append files
1081                 m->mothurOutEndLine();
1082                 for(int i=0;i<processIDS.size();i++){
1083                         
1084                         m->mothurOut("Appending files from process " + toString(processIDS[i])); m->mothurOutEndLine();
1085                         
1086                         m->appendFiles((trimFlowFileName + toString(processIDS[i]) + ".temp"), trimFlowFileName);
1087                         remove((trimFlowFileName + toString(processIDS[i]) + ".temp").c_str());
1088 //                      m->mothurOut("\tDone with trim.flow file"); m->mothurOutEndLine();
1089
1090                         m->appendFiles((scrapFlowFileName + toString(processIDS[i]) + ".temp"), scrapFlowFileName);
1091                         remove((scrapFlowFileName + toString(processIDS[i]) + ".temp").c_str());
1092 //                      m->mothurOut("\tDone with scrap.flow file"); m->mothurOutEndLine();
1093
1094                         if(fasta){
1095                                 m->appendFiles((fastaFileName + toString(processIDS[i]) + ".temp"), fastaFileName);
1096                                 remove((fastaFileName + toString(processIDS[i]) + ".temp").c_str());
1097 //                              m->mothurOut("\tDone with flow.fasta file"); m->mothurOutEndLine();
1098                         }
1099                         if(allFiles){                                           
1100                                 for (int j = 0; j < barcodePrimerComboFileNames.size(); j++) {
1101                                         for (int k = 0; k < barcodePrimerComboFileNames[0].size(); k++) {
1102                                                 m->appendFiles((barcodePrimerComboFileNames[j][k] + toString(processIDS[i]) + ".temp"), barcodePrimerComboFileNames[j][k]);
1103                                                 remove((barcodePrimerComboFileNames[j][k] + toString(processIDS[i]) + ".temp").c_str());
1104                                         }
1105                                 }
1106                         }
1107                 }
1108                 
1109                 return exitCommand;
1110 #endif          
1111         }
1112         catch(exception& e) {
1113                 m->errorOut(e, "TrimFlowsCommand", "createProcessesCreateTrim");
1114                 exit(1);
1115         }
1116 }
1117
1118 //***************************************************************************************************************