]> git.donarmstrong.com Git - mothur.git/blob - trimflowscommand.cpp
removed read.dist, read.otu, read.tree and globaldata. added current to defaults...
[mothur.git] / trimflowscommand.cpp
1 /*
2  *  trimflowscommand.cpp
3  *  Mothur
4  *
5  *  Created by Pat Schloss on 12/22/10.
6  *  Copyright 2010 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "trimflowscommand.h"
11 #include "needlemanoverlap.hpp"
12
13 //**********************************************************************************************************************
14 vector<string> TrimFlowsCommand::setParameters(){       
15         try {
16                 CommandParameter pflow("flow", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pflow);
17                 CommandParameter poligos("oligos", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(poligos);
18                 CommandParameter pmaxhomop("maxhomop", "Number", "", "9", "", "", "",false,false); parameters.push_back(pmaxhomop);
19                 CommandParameter pmaxflows("maxflows", "Number", "", "720", "", "", "",false,false); parameters.push_back(pmaxflows);
20                 CommandParameter pminflows("minflows", "Number", "", "360", "", "", "",false,false); parameters.push_back(pminflows);
21                 CommandParameter pminlength("minlength", "Number", "", "0", "", "", "",false,false); parameters.push_back(pminlength);
22                 CommandParameter pmaxlength("maxlength", "Number", "", "0", "", "", "",false,false); parameters.push_back(pmaxlength);
23                 CommandParameter ppdiffs("pdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(ppdiffs);
24                 CommandParameter pbdiffs("bdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(pbdiffs);
25                 CommandParameter ptdiffs("tdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(ptdiffs);
26                 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
27                 CommandParameter psignal("signal", "Number", "", "0.50", "", "", "",false,false); parameters.push_back(psignal);
28                 CommandParameter pnoise("noise", "Number", "", "0.70", "", "", "",false,false); parameters.push_back(pnoise);
29                 CommandParameter pallfiles("allfiles", "Boolean", "", "t", "", "", "",false,false); parameters.push_back(pallfiles);
30                 CommandParameter porder("order", "String", "", "", "", "", "",false,false); parameters.push_back(porder);
31                 CommandParameter pfasta("fasta", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pfasta);
32                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
33                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
34                 
35                 vector<string> myArray;
36                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
37                 return myArray;
38         }
39         catch(exception& e) {
40                 m->errorOut(e, "TrimFlowsCommand", "setParameters");
41                 exit(1);
42         }
43 }
44 //**********************************************************************************************************************
45 string TrimFlowsCommand::getHelpString(){       
46         try {
47                 string helpString = "";
48                 helpString += "The trim.flows command reads a flowgram file and creates .....\n";
49                 helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n";
50                 helpString += "For more details please check out the wiki http://www.mothur.org/wiki/Trim.flows.\n\n";
51                 return helpString;
52         }
53         catch(exception& e) {
54                 m->errorOut(e, "TrimFlowsCommand", "getHelpString");
55                 exit(1);
56         }
57 }
58
59 //**********************************************************************************************************************
60
61 TrimFlowsCommand::TrimFlowsCommand(){   
62         try {
63                 abort = true; calledHelp = true; 
64                 setParameters();
65                 vector<string> tempOutNames;
66                 outputTypes["flow"] = tempOutNames;
67                 outputTypes["fasta"] = tempOutNames;
68         }
69         catch(exception& e) {
70                 m->errorOut(e, "TrimFlowsCommand", "TrimFlowsCommand");
71                 exit(1);
72         }
73 }
74 //**********************************************************************************************************************
75
76 TrimFlowsCommand::TrimFlowsCommand(string option)  {
77         try {
78                 
79                 abort = false; calledHelp = false;   
80                 comboStarts = 0;
81                 
82                 //allow user to run help
83                 if(option == "help") { help(); abort = true; calledHelp = true; }
84                 
85                 else {
86                                                 
87                         vector<string> myArray = setParameters();
88                         
89                         OptionParser parser(option);
90                         map<string,string> parameters = parser.getParameters();
91                         
92                         ValidParameters validParameter;
93                         map<string,string>::iterator it;
94                         
95                         //check to make sure all parameters are valid for command
96                         for (it = parameters.begin(); it != parameters.end(); it++) { 
97                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
98                         }
99                         
100                         //initialize outputTypes
101                         vector<string> tempOutNames;
102                         outputTypes["flow"] = tempOutNames;
103                         outputTypes["fasta"] = tempOutNames;
104                         
105                         //if the user changes the input directory command factory will send this info to us in the output parameter 
106                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
107                         if (inputDir == "not found"){   inputDir = "";          }
108                         else {
109                                 string path;
110                                 it = parameters.find("flow");
111                                 //user has given a template file
112                                 if(it != parameters.end()){ 
113                                         path = m->hasPath(it->second);
114                                         //if the user has not given a path then, add inputdir. else leave path alone.
115                                         if (path == "") {       parameters["flow"] = inputDir + it->second;             }
116                                 }
117                                 
118                                 it = parameters.find("oligos");
119                                 //user has given a template file
120                                 if(it != parameters.end()){ 
121                                         path = m->hasPath(it->second);
122                                         //if the user has not given a path then, add inputdir. else leave path alone.
123                                         if (path == "") {       parameters["oligos"] = inputDir + it->second;           }
124                                 }
125                                 
126                                 
127 //                              it = parameters.find("group");
128 //                              //user has given a template file
129 //                              if(it != parameters.end()){ 
130 //                                      path = m->hasPath(it->second);
131 //                                      //if the user has not given a path then, add inputdir. else leave path alone.
132 //                                      if (path == "") {       parameters["group"] = inputDir + it->second;            }
133 //                              }
134                         }
135                         
136                         
137                         //check for required parameters
138                         flowFileName = validParameter.validFile(parameters, "flow", true);
139                         if (flowFileName == "not found") { m->mothurOut("flow is a required parameter for the trim.flows command."); m->mothurOutEndLine(); abort = true; }
140                         else if (flowFileName == "not open") { abort = true; }  
141                         
142                         //if the user changes the output directory command factory will send this info to us in the output parameter 
143                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  
144                                 outputDir = ""; 
145                                 outputDir += m->hasPath(flowFileName); //if user entered a file with a path then preserve it    
146                         }
147                         
148                         
149                         //check for optional parameter and set defaults
150                         // ...at some point should added some additional type checking...
151                         
152                         string temp;
153                         temp = validParameter.validFile(parameters, "minflows", false); if (temp == "not found") { temp = "360"; }
154                         convert(temp, minFlows);  
155
156                         temp = validParameter.validFile(parameters, "maxflows", false); if (temp == "not found") { temp = "720"; }
157                         convert(temp, maxFlows);  
158                         
159                         
160                         temp = validParameter.validFile(parameters, "oligos", true);
161                         if (temp == "not found")        {       oligoFileName = "";             }
162                         else if(temp == "not open")     {       abort = true;                   } 
163                         else                                            {       oligoFileName = temp;   }
164                         
165                         temp = validParameter.validFile(parameters, "fasta", false);            if (temp == "not found"){       fasta = 0;              }
166                         else if(m->isTrue(temp))        {       fasta = 1;      }
167                         
168                         temp = validParameter.validFile(parameters, "maxhomop", false);         if (temp == "not found"){       temp = "9";             }
169                         convert(temp, maxHomoP);  
170
171                         temp = validParameter.validFile(parameters, "signal", false);           if (temp == "not found"){       temp = "0.50";  }
172                         convert(temp, signal);  
173
174                         temp = validParameter.validFile(parameters, "noise", false);            if (temp == "not found"){       temp = "0.70";  }
175                         convert(temp, noise);  
176
177                         temp = validParameter.validFile(parameters, "minlength", false);        if (temp == "not found"){       temp = "0";             }
178                         convert(temp, minLength); 
179                         
180                         temp = validParameter.validFile(parameters, "maxlength", false);        if (temp == "not found"){       temp = "0";             }
181                         convert(temp, maxLength);
182                         
183                         temp = validParameter.validFile(parameters, "bdiffs", false);           if (temp == "not found"){       temp = "0";             }
184                         convert(temp, bdiffs);
185                         
186                         temp = validParameter.validFile(parameters, "pdiffs", false);           if (temp == "not found"){       temp = "0";             }
187                         convert(temp, pdiffs);
188                         
189                         temp = validParameter.validFile(parameters, "tdiffs", false);
190                         if (temp == "not found"){ int tempTotal = pdiffs + bdiffs;  temp = toString(tempTotal); }
191                         convert(temp, tdiffs);
192                         if(tdiffs == 0){        tdiffs = bdiffs + pdiffs;       }
193                         
194                         temp = validParameter.validFile(parameters, "allfiles", false);         if (temp == "not found"){ temp = "T";           }
195                         allFiles = m->isTrue(temp);
196                         
197                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found"){       temp = m->getProcessors();      }
198                         m->setProcessors(temp);
199                         convert(temp, processors);
200         
201                         flowOrder = validParameter.validFile(parameters, "order", false);
202                         if (flowOrder == "not found"){ flowOrder = "TACG";              }
203                         else if(flowOrder.length() != 4){
204                                 m->mothurOut("The value of the order option must be four bases long\n");
205                         }
206
207                         if(oligoFileName == ""){        allFiles = 0;           }
208
209                         numFPrimers = 0;
210                         numRPrimers = 0;
211                 }
212                 
213         }
214         catch(exception& e) {
215                 m->errorOut(e, "TrimFlowsCommand", "TrimSeqsCommand");
216                 exit(1);
217         }
218 }
219
220 //***************************************************************************************************************
221
222 int TrimFlowsCommand::execute(){
223         try{
224                 
225                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
226
227                 string trimFlowFileName = outputDir + m->getRootName(m->getSimpleName(flowFileName)) + "trim.flow";
228                 outputNames.push_back(trimFlowFileName); outputTypes["flow"].push_back(trimFlowFileName);
229                 
230                 string scrapFlowFileName = outputDir + m->getRootName(m->getSimpleName(flowFileName)) + "scrap.flow";
231                 outputNames.push_back(scrapFlowFileName); outputTypes["flow"].push_back(scrapFlowFileName);
232
233                 string fastaFileName = outputDir + m->getRootName(m->getSimpleName(flowFileName)) + "flow.fasta";
234                 if(fasta){
235                         outputNames.push_back(fastaFileName); outputTypes["fasta"].push_back(fastaFileName);
236                 }
237                 
238                 vector<unsigned long int> flowFilePos = getFlowFileBreaks();
239                 for (int i = 0; i < (flowFilePos.size()-1); i++) {
240                         lines.push_back(new linePair(flowFilePos[i], flowFilePos[(i+1)]));
241                 }       
242
243                 vector<vector<string> > barcodePrimerComboFileNames;
244                 if(oligoFileName != ""){
245                         getOligos(barcodePrimerComboFileNames); 
246                 }
247                 
248 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
249                 if(processors == 1){
250                         driverCreateTrim(flowFileName, trimFlowFileName, scrapFlowFileName, fastaFileName, barcodePrimerComboFileNames, lines[0]);
251                 }else{
252                         createProcessesCreateTrim(flowFileName, trimFlowFileName, scrapFlowFileName, fastaFileName, barcodePrimerComboFileNames); 
253                 }       
254 #else
255                 driverCreateTrim(flowFileName, trimFlowFileName, scrapFlowFileName, fastaFileName, barcodePrimerComboFileNames, lines[0]);
256 #endif
257                 
258                 if (m->control_pressed) {  return 0; }                  
259                 
260                 string flowFilesFileName;
261                 ofstream output;
262                 
263                 if(allFiles){
264                         
265                         flowFilesFileName = outputDir + m->getRootName(m->getSimpleName(flowFileName)) + "flow.files";
266                         m->openOutputFile(flowFilesFileName, output);
267
268                         for(int i=0;i<barcodePrimerComboFileNames.size();i++){
269                                 for(int j=0;j<barcodePrimerComboFileNames[0].size();j++){
270                                         
271                                         FILE * pFile;
272                                         unsigned long int size;
273                                         
274                                         //get num bytes in file
275                                         pFile = fopen (barcodePrimerComboFileNames[i][j].c_str(),"rb");
276                                         if (pFile==NULL) perror ("Error opening file");
277                                         else{
278                                                 fseek (pFile, 0, SEEK_END);
279                                                 size=ftell (pFile);
280                                                 fclose (pFile);
281                                         }
282
283                                         if(size < 10){
284                                                 remove(barcodePrimerComboFileNames[i][j].c_str());
285                                         }
286                                         else{
287                                                 output << barcodePrimerComboFileNames[i][j] << endl;
288                                         }
289                                 }
290                         }
291                         output.close();
292                 }
293                 else{
294                         flowFilesFileName = outputDir + m->getRootName(m->getSimpleName(flowFileName)) + "flow.files";
295                         m->openOutputFile(flowFilesFileName, output);
296                         
297                         output << trimFlowFileName << endl;
298                         
299                         output.close();
300                 }
301                 outputTypes["flow.files"].push_back(flowFilesFileName);
302                 outputNames.push_back(flowFileName);
303                 
304                 //set fasta file as new current fastafile
305                 string current = "";
306                 itTypes = outputTypes.find("fasta");
307                 if (itTypes != outputTypes.end()) {
308                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFastaFile(current); }
309                 }
310                 
311                 m->mothurOutEndLine();
312                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
313                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
314                 m->mothurOutEndLine();
315                 
316                 return 0;       
317         }
318         catch(exception& e) {
319                 m->errorOut(e, "TrimSeqsCommand", "execute");
320                 exit(1);
321         }
322 }
323
324 //***************************************************************************************************************
325
326 int TrimFlowsCommand::driverCreateTrim(string flowFileName, string trimFlowFileName, string scrapFlowFileName, string fastaFileName, vector<vector<string> > barcodePrimerComboFileNames, linePair* line){
327         
328         try {
329                 
330                 ofstream trimFlowFile;
331                 m->openOutputFile(trimFlowFileName, trimFlowFile);
332                 trimFlowFile.setf(ios::fixed, ios::floatfield); trimFlowFile.setf(ios::showpoint);
333
334                 ofstream scrapFlowFile;
335                 m->openOutputFile(scrapFlowFileName, scrapFlowFile);
336                 scrapFlowFile.setf(ios::fixed, ios::floatfield); scrapFlowFile.setf(ios::showpoint);
337
338                 if(line->start == 4){
339                         scrapFlowFile << maxFlows << endl;
340                         trimFlowFile << maxFlows << endl;
341                         if(allFiles){
342                                 for(int i=0;i<barcodePrimerComboFileNames.size();i++){
343                                         for(int j=0;j<barcodePrimerComboFileNames[0].size();j++){
344                                                 //                              barcodePrimerComboFileNames[i][j] += toString(getpid()) + ".temp";
345                                                 ofstream temp;
346                                                 m->openOutputFile(barcodePrimerComboFileNames[i][j], temp);
347                                                 temp << maxFlows << endl;
348                                                 temp.close();
349                                         }
350                                 }                       
351                         }
352                 }
353                 
354                 FlowData flowData(numFlows, signal, noise, maxHomoP, flowOrder);
355                 
356                 ofstream fastaFile;
357                 if(fasta){      m->openOutputFile(fastaFileName, fastaFile);    }
358                 
359                 ifstream flowFile;
360                 m->openInputFile(flowFileName, flowFile);
361                 
362                 flowFile.seekg(line->start);
363
364                 int count = 0;
365                 bool moreSeqs = 1;
366                         
367                 while(moreSeqs) {
368                         
369                         int success = 1;
370                         int currentSeqDiffs = 0;
371                         string trashCode = "";
372                         
373                         flowData.getNext(flowFile);
374                         flowData.capFlows(maxFlows);    
375                         
376                         Sequence currSeq = flowData.getSequence();
377                         if(!flowData.hasMinFlows(minFlows)){    //screen to see if sequence is of a minimum number of flows
378                                 success = 0;
379                                 trashCode += 'l';
380                         }
381                         
382                         if(minLength > 0 || maxLength > 0){     //screen to see if sequence is above and below a specific number of bases
383                                 int seqLength = currSeq.getNumBases();
384                                 if(seqLength < minLength || seqLength > maxLength){
385                                         success = 0;
386                                         trashCode += 'l';
387                                 }
388                         }
389                         
390                         int primerIndex = 0;
391                         int barcodeIndex = 0;
392                         
393                         if(barcodes.size() != 0){
394                                 success = stripBarcode(currSeq, barcodeIndex);
395                                 if(success > bdiffs)            {       trashCode += 'b';       }
396                                 else{ currentSeqDiffs += success;  }
397                         }
398                         
399                         if(numFPrimers != 0){
400                                 success = stripForward(currSeq, primerIndex);
401                                 if(success > pdiffs)            {       trashCode += 'f';       }
402                                 else{ currentSeqDiffs += success;  }
403                         }
404                         
405                         if (currentSeqDiffs > tdiffs)   {       trashCode += 't';   }
406                         
407                         if(numRPrimers != 0){
408                                 success = stripReverse(currSeq);
409                                 if(!success)                            {       trashCode += 'r';       }
410                         }
411
412                         if(trashCode.length() == 0){
413                                                         
414                                 flowData.printFlows(trimFlowFile);
415                         
416                                 if(fasta)       {       currSeq.printSequence(fastaFile);       }
417                                 
418                                 if(allFiles){
419                                         ofstream output;
420                                         m->openOutputFileAppend(barcodePrimerComboFileNames[barcodeIndex][primerIndex], output);
421                                         output.setf(ios::fixed, ios::floatfield); trimFlowFile.setf(ios::showpoint);
422                                         
423                                         flowData.printFlows(output);
424                                         output.close();
425                                 }                               
426                         }
427                         else{
428                                 flowData.printFlows(scrapFlowFile, trashCode);
429                         }
430                                 
431                         count++;
432                                                 
433                         //report progress
434                         if((count) % 10000 == 0){       m->mothurOut(toString(count)); m->mothurOutEndLine();           }
435
436 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
437                         unsigned long int pos = flowFile.tellg();
438
439                         if ((pos == -1) || (pos >= line->end)) { break; }
440 #else
441                         if (flowFile.eof()) { break; }
442 #endif
443                         
444                 }
445                 //report progress
446                 if((count) % 10000 != 0){       m->mothurOut(toString(count)); m->mothurOutEndLine();           }
447                 
448                 trimFlowFile.close();
449                 scrapFlowFile.close();
450                 flowFile.close();
451                 if(fasta){      fastaFile.close();      }
452                 
453                 return count;
454         }
455         catch(exception& e) {
456                 m->errorOut(e, "TrimSeqsCommand", "driverCreateTrim");
457                 exit(1);
458         }
459 }
460
461 //***************************************************************************************************************
462
463 void TrimFlowsCommand::getOligos(vector<vector<string> >& outFlowFileNames){
464         try {
465                 ifstream oligosFile;
466                 m->openInputFile(oligoFileName, oligosFile);
467                 
468                 string type, oligo, group;
469
470                 int indexPrimer = 0;
471                 int indexBarcode = 0;
472                 
473                 while(!oligosFile.eof()){
474                 
475                         oligosFile >> type; m->gobble(oligosFile);      //get the first column value of the row - is it a comment or a feature we are interested in?
476
477                         if(type[0] == '#'){     //igore the line because there's a comment
478                                 while (!oligosFile.eof())       {       char c = oligosFile.get(); if (c == 10 || c == 13){     break;  }       } // get rest of line if there's any crap there
479                         }
480                         else{                           //there's a feature we're interested in
481
482                                 for(int i=0;i<type.length();i++){       type[i] = toupper(type[i]);  }                                  //make type case insensitive
483
484                                 oligosFile >> oligo;    //get the DNA sequence for the feature
485
486                                 for(int i=0;i<oligo.length();i++){      //make type case insensitive and change any U's to T's
487                                         oligo[i] = toupper(oligo[i]);
488                                         if(oligo[i] == 'U')     {       oligo[i] = 'T'; }
489                                 }
490
491                                 if(type == "FORWARD"){  //if the feature is a forward primer...
492                                         group = "";
493
494                                         while (!oligosFile.eof())       {       // get rest of line in case there is a primer name = will have the name of the primer
495                                                 char c = oligosFile.get(); 
496                                                 if (c == 10 || c == 13){        break;  }
497                                                 else if (c == 32 || c == 9){;} //space or tab
498                                                 else {  group += c;  }
499                                         } 
500
501                                         //have we seen this primer already?
502                                         map<string, int>::iterator itPrimer = primers.find(oligo);
503                                         if (itPrimer != primers.end()) { m->mothurOut("primer " + oligo + " is in your oligos file already."); m->mothurOutEndLine();  }
504
505                                         primers[oligo]=indexPrimer; indexPrimer++;
506                                         primerNameVector.push_back(group);
507
508                                 }
509                                 else if(type == "REVERSE"){
510                                         Sequence oligoRC("reverse", oligo);
511                                         oligoRC.reverseComplement();
512                                         revPrimer.push_back(oligoRC.getUnaligned());
513                                 }
514                                 else if(type == "BARCODE"){
515                                         oligosFile >> group;
516
517                                         //check for repeat barcodes
518                                         map<string, int>::iterator itBar = barcodes.find(oligo);
519                                         if (itBar != barcodes.end()) { m->mothurOut("barcode " + oligo + " is in your oligos file already."); m->mothurOutEndLine();  }
520
521                                         barcodes[oligo]=indexBarcode; indexBarcode++;
522                                         barcodeNameVector.push_back(group);
523                                 }
524                                 else{
525                                         m->mothurOut(type + " is not recognized as a valid type. Choices are forward, reverse, and barcode. Ignoring " + oligo + "."); m->mothurOutEndLine();  
526                                 }
527                         }
528
529                         m->gobble(oligosFile);
530                 }
531                 oligosFile.close();
532                 
533                 if(barcodeNameVector.size() == 0 && primerNameVector[0] == ""){ allFiles = 0;   }
534                 
535                 //add in potential combos
536                 if(barcodeNameVector.size() == 0){
537                         barcodes[""] = 0;
538                         barcodeNameVector.push_back("");                        
539                 }
540                 
541                 if(primerNameVector.size() == 0){
542                         primers[""] = 0;
543                         primerNameVector.push_back("");                 
544                 }
545                 
546                 
547                 outFlowFileNames.resize(barcodeNameVector.size());
548                 for(int i=0;i<outFlowFileNames.size();i++){
549                         outFlowFileNames[i].assign(primerNameVector.size(), "");
550                 }
551                 
552                 if(allFiles){
553
554                         for(map<string, int>::iterator itBar = barcodes.begin();itBar != barcodes.end();itBar++){
555                                 for(map<string, int>::iterator itPrimer = primers.begin();itPrimer != primers.end(); itPrimer++){
556
557                                         string primerName = primerNameVector[itPrimer->second];
558                                         string barcodeName = barcodeNameVector[itBar->second];
559                                                                                 
560                                         string comboGroupName = "";
561                                         string fileName = "";
562                                         
563                                         if(primerName == ""){
564                                                 comboGroupName = barcodeNameVector[itBar->second];
565                                                 fileName = outputDir + m->getRootName(m->getSimpleName(flowFileName)) + comboGroupName + ".flow";
566                                         }
567                                         else{
568                                                 if(barcodeName == ""){
569                                                         comboGroupName = primerNameVector[itPrimer->second];
570                                                 }
571                                                 else{
572                                                         comboGroupName = barcodeNameVector[itBar->second] + "." + primerNameVector[itPrimer->second];
573                                                 }
574                                                 fileName = outputDir + m->getRootName(m->getSimpleName(flowFileName)) + comboGroupName + ".flow";
575                                         }
576                                         
577                                         outputNames.push_back(fileName);
578                                         outputTypes["flow"].push_back(fileName);
579                                         outFlowFileNames[itBar->second][itPrimer->second] = fileName;
580                                         
581                                         ofstream temp;
582                                         m->openOutputFile(fileName, temp);
583                                         temp.close();
584                                 }
585                         }
586                 }
587                 
588                 numFPrimers = primers.size();
589                 numRPrimers = revPrimer.size();
590                 
591         }
592         catch(exception& e) {
593                 m->errorOut(e, "TrimSeqsCommand", "getOligos");
594                 exit(1);
595         }
596 }
597
598 //***************************************************************************************************************
599
600 int TrimFlowsCommand::stripBarcode(Sequence& seq, int& group){
601         try {
602                 
603                 string rawSequence = seq.getUnaligned();
604                 int success = bdiffs + 1;       //guilty until proven innocent
605                 
606                 //can you find the barcode
607                 for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
608                         string oligo = it->first;
609                         if(rawSequence.length() < oligo.length()){      //let's just assume that the barcodes are the same length
610                                 success = bdiffs + 10;                                  //if the sequence is shorter than the barcode then bail out
611                                 break;  
612                         }
613                         
614                         if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
615                                 group = it->second;
616                                 seq.setUnaligned(rawSequence.substr(oligo.length()));
617                                 
618                                 success = 0;
619                                 break;
620                         }
621                 }
622                 
623                 //if you found the barcode or if you don't want to allow for diffs
624                 if ((bdiffs == 0) || (success == 0)) { return success;  }
625                 
626                 else { //try aligning and see if you can find it
627                         
628                         int maxLength = 0;
629                         
630                         Alignment* alignment;
631                         if (barcodes.size() > 0) {
632                                 map<string,int>::iterator it=barcodes.begin();
633                                 
634                                 for(it;it!=barcodes.end();it++){
635                                         if(it->first.length() > maxLength){
636                                                 maxLength = it->first.length();
637                                         }
638                                 }
639                                 alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+bdiffs+1));  
640                                 
641                         }else{ alignment = NULL; } 
642                         
643                         //can you find the barcode
644                         int minDiff = 1e6;
645                         int minCount = 1;
646                         int minGroup = -1;
647                         int minPos = 0;
648                         
649                         for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
650                                 string oligo = it->first;
651                                 //                              int length = oligo.length();
652                                 
653                                 if(rawSequence.length() < maxLength){   //let's just assume that the barcodes are the same length
654                                         success = bdiffs + 10;
655                                         break;
656                                 }
657                                 
658                                 //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
659                                 alignment->align(oligo, rawSequence.substr(0,oligo.length()+bdiffs));
660                                 oligo = alignment->getSeqAAln();
661                                 string temp = alignment->getSeqBAln();
662                                 
663                                 int alnLength = oligo.length();
664                                 
665                                 for(int i=oligo.length()-1;i>=0;i--){
666                                         if(oligo[i] != '-'){    alnLength = i+1;        break;  }
667                                 }
668                                 oligo = oligo.substr(0,alnLength);
669                                 temp = temp.substr(0,alnLength);
670                                 
671                                 int numDiff = countDiffs(oligo, temp);
672                                 
673                                 if(numDiff < minDiff){
674                                         minDiff = numDiff;
675                                         minCount = 1;
676                                         minGroup = it->second;
677                                         minPos = 0;
678                                         for(int i=0;i<alnLength;i++){
679                                                 if(temp[i] != '-'){
680                                                         minPos++;
681                                                 }
682                                         }
683                                 }
684                                 else if(numDiff == minDiff){
685                                         minCount++;
686                                 }
687                                 
688                         }
689                         
690                         if(minDiff > bdiffs)    {       success = minDiff;              }       //no good matches
691                         else if(minCount > 1)   {       success = bdiffs + 100; }       //can't tell the difference between multiple barcodes
692                         else{                                                                                                   //use the best match
693                                 group = minGroup;
694                                 seq.setUnaligned(rawSequence.substr(minPos));
695                                 success = minDiff;
696                         }
697                         
698                         if (alignment != NULL) {  delete alignment;  }
699                         
700                 }
701                 
702                 return success;
703                 
704         }
705         catch(exception& e) {
706                 m->errorOut(e, "TrimFlowsCommand", "stripBarcode");
707                 exit(1);
708         }
709         
710 }
711
712 //***************************************************************************************************************
713
714 int TrimFlowsCommand::stripForward(Sequence& seq, int& group){
715         try {
716                 
717                 string rawSequence = seq.getUnaligned();
718                 int success = pdiffs + 1;       //guilty until proven innocent
719
720                 //can you find the primer
721                 for(map<string,int>::iterator it=primers.begin();it!=primers.end();it++){
722                         string oligo = it->first;
723                         if(rawSequence.length() < oligo.length()){      //let's just assume that the primers are the same length
724                                 success = pdiffs + 10;                                  //if the sequence is shorter than the barcode then bail out
725                                 break;  
726                         }
727                         
728                         if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
729                                 group = it->second;
730                                 seq.setUnaligned(rawSequence.substr(oligo.length()));
731                                 success = 0;
732                                 break;
733                         }
734                 }
735                 
736                 //if you found the barcode or if you don't want to allow for diffs
737                 if ((pdiffs == 0) || (success == 0)) {  return success;  }
738                 
739                 else { //try aligning and see if you can find it
740                         
741                         int maxLength = 0;
742                         
743                         Alignment* alignment;
744                         if (primers.size() > 0) {
745                                 map<string,int>::iterator it=primers.begin();
746                                 
747                                 for(it;it!=primers.end();it++){
748                                         if(it->first.length() > maxLength){
749                                                 maxLength = it->first.length();
750                                         }
751                                 }
752                                 alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+pdiffs+1));  
753                                 
754                         }else{ alignment = NULL; } 
755                         
756                         //can you find the barcode
757                         int minDiff = 1e6;
758                         int minCount = 1;
759                         int minGroup = -1;
760                         int minPos = 0;
761                         
762                         for(map<string,int>::iterator it=primers.begin();it!=primers.end();it++){
763                                 string oligo = it->first;
764                                 //                              int length = oligo.length();
765                                 
766                                 if(rawSequence.length() < maxLength){   
767                                         success = pdiffs + 100;
768                                         break;
769                                 }
770                                 
771                                 //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
772                                 alignment->align(oligo, rawSequence.substr(0,oligo.length()+pdiffs));
773                                 oligo = alignment->getSeqAAln();
774                                 string temp = alignment->getSeqBAln();
775                                 
776                                 int alnLength = oligo.length();
777                                 
778                                 for(int i=oligo.length()-1;i>=0;i--){
779                                         if(oligo[i] != '-'){    alnLength = i+1;        break;  }
780                                 }
781                                 oligo = oligo.substr(0,alnLength);
782                                 temp = temp.substr(0,alnLength);
783                                 
784                                 int numDiff = countDiffs(oligo, temp);
785                                 
786                                 if(numDiff < minDiff){
787                                         minDiff = numDiff;
788                                         minCount = 1;
789                                         minGroup = it->second;
790                                         minPos = 0;
791                                         for(int i=0;i<alnLength;i++){
792                                                 if(temp[i] != '-'){
793                                                         minPos++;
794                                                 }
795                                         }
796                                 }
797                                 else if(numDiff == minDiff){
798                                         minCount++;
799                                 }
800                                 
801                         }
802                         
803                         if(minDiff > pdiffs)    {       success = minDiff;              }       //no good matches
804                         else if(minCount > 1)   {       success = pdiffs + 10;  }       //can't tell the difference between multiple primers
805                         else{                                                                                                   //use the best match
806                                 group = minGroup;
807                                 seq.setUnaligned(rawSequence.substr(minPos));
808                                 success = minDiff;
809                         }
810                         
811                         if (alignment != NULL) {  delete alignment;  }
812                         
813                 }
814                 
815                 return success;
816                 
817         }
818         catch(exception& e) {
819                 m->errorOut(e, "TrimFlowsCommand", "stripForward");
820                 exit(1);
821         }
822 }
823
824 //***************************************************************************************************************
825
826 bool TrimFlowsCommand::stripReverse(Sequence& seq){
827         try {
828
829                 string rawSequence = seq.getUnaligned();
830                 bool success = 0;       //guilty until proven innocent
831                 
832                 for(int i=0;i<numRPrimers;i++){
833                         string oligo = revPrimer[i];
834                         
835                         if(rawSequence.length() < oligo.length()){
836                                 success = 0;
837                                 break;
838                         }
839                         
840                         if(compareDNASeq(oligo, rawSequence.substr(rawSequence.length()-oligo.length(),oligo.length()))){
841                                 seq.setUnaligned(rawSequence.substr(0,rawSequence.length()-oligo.length()));
842                                 success = 1;
843                                 break;
844                         }
845                 }       
846
847                 return success;
848                 
849         }
850         catch(exception& e) {
851                 m->errorOut(e, "TrimFlowsCommand", "stripReverse");
852                 exit(1);
853         }
854 }
855
856
857 //***************************************************************************************************************
858
859 bool TrimFlowsCommand::compareDNASeq(string oligo, string seq){
860         try {
861                 bool success = 1;
862                 int length = oligo.length();
863                 
864                 for(int i=0;i<length;i++){
865                         
866                         if(oligo[i] != seq[i]){
867                                 if(oligo[i] == 'A' || oligo[i] == 'T' || oligo[i] == 'G' || oligo[i] == 'C')    {       success = 0;    }
868                                 else if((oligo[i] == 'N' || oligo[i] == 'I') && (seq[i] == 'N'))                                {       success = 0;    }
869                                 else if(oligo[i] == 'R' && (seq[i] != 'A' && seq[i] != 'G'))                                    {       success = 0;    }
870                                 else if(oligo[i] == 'Y' && (seq[i] != 'C' && seq[i] != 'T'))                                    {       success = 0;    }
871                                 else if(oligo[i] == 'M' && (seq[i] != 'C' && seq[i] != 'A'))                                    {       success = 0;    }
872                                 else if(oligo[i] == 'K' && (seq[i] != 'T' && seq[i] != 'G'))                                    {       success = 0;    }
873                                 else if(oligo[i] == 'W' && (seq[i] != 'T' && seq[i] != 'A'))                                    {       success = 0;    }
874                                 else if(oligo[i] == 'S' && (seq[i] != 'C' && seq[i] != 'G'))                                    {       success = 0;    }
875                                 else if(oligo[i] == 'B' && (seq[i] != 'C' && seq[i] != 'T' && seq[i] != 'G'))   {       success = 0;    }
876                                 else if(oligo[i] == 'D' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'G'))   {       success = 0;    }
877                                 else if(oligo[i] == 'H' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'C'))   {       success = 0;    }
878                                 else if(oligo[i] == 'V' && (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G'))   {       success = 0;    }                       
879                                 
880                                 if(success == 0)        {       break;   }
881                         }
882                         else{
883                                 success = 1;
884                         }
885                 }
886                 
887                 return success;
888         }
889         catch(exception& e) {
890                 m->errorOut(e, "TrimFlowsCommand", "compareDNASeq");
891                 exit(1);
892         }
893         
894 }
895
896 //***************************************************************************************************************
897
898 int TrimFlowsCommand::countDiffs(string oligo, string seq){
899         try {
900                 
901                 int length = oligo.length();
902                 int countDiffs = 0;
903                 
904                 for(int i=0;i<length;i++){
905                         
906                         if(oligo[i] != seq[i]){
907                                 if(oligo[i] == 'A' || oligo[i] == 'T' || oligo[i] == 'G' || oligo[i] == 'C' || oligo[i] == '-' || oligo[i] == '.')      {       countDiffs++;   }
908                                 else if((oligo[i] == 'N' || oligo[i] == 'I') && (seq[i] == 'N'))                                {       countDiffs++;   }
909                                 else if(oligo[i] == 'R' && (seq[i] != 'A' && seq[i] != 'G'))                                    {       countDiffs++;   }
910                                 else if(oligo[i] == 'Y' && (seq[i] != 'C' && seq[i] != 'T'))                                    {       countDiffs++;   }
911                                 else if(oligo[i] == 'M' && (seq[i] != 'C' && seq[i] != 'A'))                                    {       countDiffs++;   }
912                                 else if(oligo[i] == 'K' && (seq[i] != 'T' && seq[i] != 'G'))                                    {       countDiffs++;   }
913                                 else if(oligo[i] == 'W' && (seq[i] != 'T' && seq[i] != 'A'))                                    {       countDiffs++;   }
914                                 else if(oligo[i] == 'S' && (seq[i] != 'C' && seq[i] != 'G'))                                    {       countDiffs++;   }
915                                 else if(oligo[i] == 'B' && (seq[i] != 'C' && seq[i] != 'T' && seq[i] != 'G'))   {       countDiffs++;   }
916                                 else if(oligo[i] == 'D' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'G'))   {       countDiffs++;   }
917                                 else if(oligo[i] == 'H' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'C'))   {       countDiffs++;   }
918                                 else if(oligo[i] == 'V' && (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G'))   {       countDiffs++;   }       
919                         }
920                         
921                 }
922                 
923                 return countDiffs;
924         }
925         catch(exception& e) {
926                 m->errorOut(e, "TrimFlowsCommand", "countDiffs");
927                 exit(1);
928         }
929         
930 }
931
932 /**************************************************************************************************/
933
934 vector<unsigned long int> TrimFlowsCommand::getFlowFileBreaks() {
935
936         try{
937                         
938                 vector<unsigned long int> filePos;
939                 filePos.push_back(0);
940                                         
941                 FILE * pFile;
942                 unsigned long int size;
943                 
944                 //get num bytes in file
945                 pFile = fopen (flowFileName.c_str(),"rb");
946                 if (pFile==NULL) perror ("Error opening file");
947                 else{
948                         fseek (pFile, 0, SEEK_END);
949                         size=ftell (pFile);
950                         fclose (pFile);
951                 }
952                                 
953                 //estimate file breaks
954                 unsigned long int chunkSize = 0;
955                 chunkSize = size / processors;
956
957                 //file too small to divide by processors
958                 if (chunkSize == 0)  {  processors = 1; filePos.push_back(size); return filePos;        }
959                 
960                 //for each process seekg to closest file break and search for next '>' char. make that the filebreak
961                 for (int i = 0; i < processors; i++) {
962                         unsigned long int spot = (i+1) * chunkSize;
963                         
964                         ifstream in;
965                         m->openInputFile(flowFileName, in);
966                         in.seekg(spot);
967                         
968                         string dummy = m->getline(in);
969                         
970                         //there was not another sequence before the end of the file
971                         unsigned long int sanityPos = in.tellg();
972                         
973 //                      if (sanityPos == -1) {  break;  }
974 //                      else {  filePos.push_back(newSpot);  }
975                         if (sanityPos == -1) {  break;  }
976                         else {  filePos.push_back(sanityPos);  }
977                         
978                         in.close();
979                 }
980                 
981                 //save end pos
982                 filePos.push_back(size);
983                 
984                 //sanity check filePos
985                 for (int i = 0; i < (filePos.size()-1); i++) {
986                         if (filePos[(i+1)] <= filePos[i]) {  filePos.erase(filePos.begin()+(i+1)); i--; }
987                 }
988
989                 ifstream in;
990                 m->openInputFile(flowFileName, in);
991                 in >> numFlows;
992                 m->gobble(in);
993                 unsigned long int spot = in.tellg();
994                 filePos[0] = spot;
995                 in.close();
996                 
997                 processors = (filePos.size() - 1);
998                 
999                 return filePos; 
1000         }
1001         catch(exception& e) {
1002                 m->errorOut(e, "TrimSeqsCommand", "getFlowFileBreaks");
1003                 exit(1);
1004         }
1005 }
1006
1007 /**************************************************************************************************/
1008
1009 int TrimFlowsCommand::createProcessesCreateTrim(string flowFileName, string trimFlowFileName, string scrapFlowFileName, string fastaFileName, vector<vector<string> > barcodePrimerComboFileNames){
1010
1011         try {
1012 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
1013                 int process = 1;
1014                 int exitCommand = 1;
1015                 processIDS.clear();
1016                 
1017                 //loop through and create all the processes you want
1018                 while (process != processors) {
1019                         int pid = fork();
1020                         
1021                         if (pid > 0) {
1022                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
1023                                 process++;
1024                         }else if (pid == 0){
1025                                 
1026                                 vector<vector<string> > tempBarcodePrimerComboFileNames = barcodePrimerComboFileNames;
1027                                 if(allFiles){
1028                                         for(int i=0;i<tempBarcodePrimerComboFileNames.size();i++){
1029                                                 for(int j=0;j<tempBarcodePrimerComboFileNames[0].size();j++){
1030                                                         tempBarcodePrimerComboFileNames[i][j] += toString(getpid()) + ".temp";
1031                                                         ofstream temp;
1032                                                         m->openOutputFile(tempBarcodePrimerComboFileNames[i][j], temp);
1033                                                         temp.close();
1034                                                         
1035                                                 }
1036                                         }
1037                                 }
1038                                 driverCreateTrim(flowFileName,
1039                                                                  (trimFlowFileName + toString(getpid()) + ".temp"),
1040                                                                  (scrapFlowFileName + toString(getpid()) + ".temp"),
1041                                                                  (fastaFileName + toString(getpid()) + ".temp"),
1042                                                                  tempBarcodePrimerComboFileNames, lines[process]);
1043
1044                                 exit(0);
1045                         }else { 
1046                                 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine(); 
1047                                 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
1048                                 exit(0);
1049                         }
1050                 }
1051                 
1052                 //parent do my part
1053                 ofstream temp;
1054                 m->openOutputFile(trimFlowFileName, temp);
1055                 temp.close();
1056
1057                 m->openOutputFile(scrapFlowFileName, temp);
1058                 temp.close();
1059                 
1060                 if(fasta){
1061                         m->openOutputFile(fastaFileName, temp);
1062                         temp.close();
1063                 }
1064                 
1065                 driverCreateTrim(flowFileName, trimFlowFileName, scrapFlowFileName, fastaFileName, barcodePrimerComboFileNames, lines[0]);
1066
1067                 //force parent to wait until all the processes are done
1068                 for (int i=0;i<processIDS.size();i++) { 
1069                         int temp = processIDS[i];
1070                         wait(&temp);
1071                 }
1072                 
1073                 //append files
1074                 m->mothurOutEndLine();
1075                 for(int i=0;i<processIDS.size();i++){
1076                         
1077                         m->mothurOut("Appending files from process " + toString(processIDS[i])); m->mothurOutEndLine();
1078                         
1079                         m->appendFiles((trimFlowFileName + toString(processIDS[i]) + ".temp"), trimFlowFileName);
1080                         remove((trimFlowFileName + toString(processIDS[i]) + ".temp").c_str());
1081 //                      m->mothurOut("\tDone with trim.flow file"); m->mothurOutEndLine();
1082
1083                         m->appendFiles((scrapFlowFileName + toString(processIDS[i]) + ".temp"), scrapFlowFileName);
1084                         remove((scrapFlowFileName + toString(processIDS[i]) + ".temp").c_str());
1085 //                      m->mothurOut("\tDone with scrap.flow file"); m->mothurOutEndLine();
1086
1087                         if(fasta){
1088                                 m->appendFiles((fastaFileName + toString(processIDS[i]) + ".temp"), fastaFileName);
1089                                 remove((fastaFileName + toString(processIDS[i]) + ".temp").c_str());
1090 //                              m->mothurOut("\tDone with flow.fasta file"); m->mothurOutEndLine();
1091                         }
1092                         if(allFiles){                                           
1093                                 for (int j = 0; j < barcodePrimerComboFileNames.size(); j++) {
1094                                         for (int k = 0; k < barcodePrimerComboFileNames[0].size(); k++) {
1095                                                 m->appendFiles((barcodePrimerComboFileNames[j][k] + toString(processIDS[i]) + ".temp"), barcodePrimerComboFileNames[j][k]);
1096                                                 remove((barcodePrimerComboFileNames[j][k] + toString(processIDS[i]) + ".temp").c_str());
1097                                         }
1098                                 }
1099                         }
1100                 }
1101                 
1102                 return exitCommand;
1103 #endif          
1104         }
1105         catch(exception& e) {
1106                 m->errorOut(e, "TrimFlowsCommand", "createProcessesCreateTrim");
1107                 exit(1);
1108         }
1109 }
1110
1111 //***************************************************************************************************************