]> git.donarmstrong.com Git - mothur.git/blob - trimflowscommand.cpp
added [ERROR] flag if command aborts
[mothur.git] / trimflowscommand.cpp
1 /*
2  *  trimflowscommand.cpp
3  *  Mothur
4  *
5  *  Created by Pat Schloss on 12/22/10.
6  *  Copyright 2010 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "trimflowscommand.h"
11 #include "needlemanoverlap.hpp"
12
13 //**********************************************************************************************************************
14
15 vector<string> TrimFlowsCommand::getValidParameters(){  
16         try {
17                 string Array[] =  {"flow", "totalflows", "minflows",
18                         "fasta", "minlength", "maxlength", "maxhomop", "signal", "noise"
19                         "oligos", "pdiffs", "bdiffs", "tdiffs", 
20                         "allfiles", "processors",
21
22
23                         
24 //                      "group",
25                         "outputdir","inputdir"
26                 
27                 };
28                 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
29                 return myArray;
30         }
31         catch(exception& e) {
32                 m->errorOut(e, "TrimFlowsCommand", "getValidParameters");
33                 exit(1);
34         }
35 }
36
37 //**********************************************************************************************************************
38
39 vector<string> TrimFlowsCommand::getRequiredParameters(){       
40         try {
41                 string Array[] =  {"flow"};
42                 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
43                 return myArray;
44         }
45         catch(exception& e) {
46                 m->errorOut(e, "TrimFlowsCommand", "getRequiredParameters");
47                 exit(1);
48         }
49 }
50
51 //**********************************************************************************************************************
52
53 vector<string> TrimFlowsCommand::getRequiredFiles(){    
54         try {
55                 vector<string> myArray;
56                 return myArray;
57         }
58         catch(exception& e) {
59                 m->errorOut(e, "TrimFlowsCommand", "getRequiredFiles");
60                 exit(1);
61         }
62 }
63
64 //**********************************************************************************************************************
65
66 TrimFlowsCommand::TrimFlowsCommand(){   
67         try {
68                 abort = true; calledHelp = true; 
69                 vector<string> tempOutNames;
70                 outputTypes["flow"] = tempOutNames;
71                 outputTypes["fasta"] = tempOutNames;
72         }
73         catch(exception& e) {
74                 m->errorOut(e, "TrimFlowsCommand", "TrimFlowsCommand");
75                 exit(1);
76         }
77 }
78
79 //***************************************************************************************************************
80
81 TrimFlowsCommand::~TrimFlowsCommand(){  /*      do nothing      */      }
82
83 //***************************************************************************************************************
84
85 void TrimFlowsCommand::help(){
86         try {
87                 m->mothurOut("The trim.flows command reads a flowgram file and creates .....\n");
88                 m->mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n");
89                 m->mothurOut("For more details please check out the wiki http://www.mothur.org/wiki/Trim.flows.\n\n");
90                 
91         }
92         catch(exception& e) {
93                 m->errorOut(e, "TrimFlowsCommand", "help");
94                 exit(1);
95         }
96 }
97
98 //**********************************************************************************************************************
99
100 TrimFlowsCommand::TrimFlowsCommand(string option)  {
101         try {
102                 
103                 abort = false; calledHelp = false;   
104                 comboStarts = 0;
105                 
106                 //allow user to run help
107                 if(option == "help") { help(); abort = true; calledHelp = true; }
108                 
109                 else {
110                         //valid paramters for this command
111                         string AlignArray[] =  {"flow", "totalflows", "minflows",
112                                 "fasta", "minlength", "maxlength", "maxhomop", "signal", "noise",
113                                 "oligos", "pdiffs", "bdiffs", "tdiffs", 
114                                 "allfiles", "processors",
115                 
116                                 //                      "group",
117                                 "outputdir","inputdir"
118                                 
119                         };
120                         
121                         vector<string> myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string)));
122                         
123                         OptionParser parser(option);
124                         map<string,string> parameters = parser.getParameters();
125                         
126                         ValidParameters validParameter;
127                         map<string,string>::iterator it;
128                         
129                         //check to make sure all parameters are valid for command
130                         for (it = parameters.begin(); it != parameters.end(); it++) { 
131                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
132                         }
133                         
134                         //initialize outputTypes
135                         vector<string> tempOutNames;
136                         outputTypes["flow"] = tempOutNames;
137                         outputTypes["fasta"] = tempOutNames;
138                         
139                         //if the user changes the input directory command factory will send this info to us in the output parameter 
140                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
141                         if (inputDir == "not found"){   inputDir = "";          }
142                         else {
143                                 string path;
144                                 it = parameters.find("flow");
145                                 //user has given a template file
146                                 if(it != parameters.end()){ 
147                                         path = m->hasPath(it->second);
148                                         //if the user has not given a path then, add inputdir. else leave path alone.
149                                         if (path == "") {       parameters["flow"] = inputDir + it->second;             }
150                                 }
151                                 
152                                 it = parameters.find("oligos");
153                                 //user has given a template file
154                                 if(it != parameters.end()){ 
155                                         path = m->hasPath(it->second);
156                                         //if the user has not given a path then, add inputdir. else leave path alone.
157                                         if (path == "") {       parameters["oligos"] = inputDir + it->second;           }
158                                 }
159                                 
160                                 
161 //                              it = parameters.find("group");
162 //                              //user has given a template file
163 //                              if(it != parameters.end()){ 
164 //                                      path = m->hasPath(it->second);
165 //                                      //if the user has not given a path then, add inputdir. else leave path alone.
166 //                                      if (path == "") {       parameters["group"] = inputDir + it->second;            }
167 //                              }
168                         }
169                         
170                         
171                         //check for required parameters
172                         flowFileName = validParameter.validFile(parameters, "flow", true);
173                         if (flowFileName == "not found") { m->mothurOut("flow is a required parameter for the trim.flows command."); m->mothurOutEndLine(); abort = true; }
174                         else if (flowFileName == "not open") { abort = true; }  
175                         
176                         //if the user changes the output directory command factory will send this info to us in the output parameter 
177                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  
178                                 outputDir = ""; 
179                                 outputDir += m->hasPath(flowFileName); //if user entered a file with a path then preserve it    
180                         }
181                         
182                         
183                         //check for optional parameter and set defaults
184                         // ...at some point should added some additional type checking...
185                         
186                         string temp;
187                         temp = validParameter.validFile(parameters, "minflows", false);         if (temp == "not found") { temp = "360"; }
188                         convert(temp, minFlows);  
189
190                         temp = validParameter.validFile(parameters, "totalflows", false);       if (temp == "not found") { temp = "720"; }
191                         convert(temp, totalFlows);  
192                         
193                         
194                         temp = validParameter.validFile(parameters, "oligos", true);
195                         if (temp == "not found")        {       oligoFileName = "";             }
196                         else if(temp == "not open")     {       abort = true;                   } 
197                         else                                            {       oligoFileName = temp;   }
198                         
199                         temp = validParameter.validFile(parameters, "fasta", false);            if (temp == "not found"){       fasta = 0;              }
200                         else if(m->isTrue(temp))        {       fasta = 1;      }
201                         
202                         temp = validParameter.validFile(parameters, "maxhomop", false);         if (temp == "not found"){       temp = "9";             }
203                         convert(temp, maxHomoP);  
204
205                         temp = validParameter.validFile(parameters, "signal", false);           if (temp == "not found"){       temp = "0.50";  }
206                         convert(temp, signal);  
207
208                         temp = validParameter.validFile(parameters, "noise", false);            if (temp == "not found"){       temp = "0.70";  }
209                         convert(temp, noise);  
210
211                         temp = validParameter.validFile(parameters, "minlength", false);        if (temp == "not found"){       temp = "0";             }
212                         convert(temp, minLength); 
213                         
214                         temp = validParameter.validFile(parameters, "maxlength", false);        if (temp == "not found"){       temp = "0";             }
215                         convert(temp, maxLength);
216                         
217                         temp = validParameter.validFile(parameters, "bdiffs", false);           if (temp == "not found"){       temp = "0";             }
218                         convert(temp, bdiffs);
219                         
220                         temp = validParameter.validFile(parameters, "pdiffs", false);           if (temp == "not found"){       temp = "0";             }
221                         convert(temp, pdiffs);
222                         
223                         temp = validParameter.validFile(parameters, "tdiffs", false);
224                         if (temp == "not found"){ int tempTotal = pdiffs + bdiffs;  temp = toString(tempTotal); }
225                         convert(temp, tdiffs);
226                         if(tdiffs == 0){        tdiffs = bdiffs + pdiffs;       }
227                         
228                         temp = validParameter.validFile(parameters, "allfiles", false);         if (temp == "not found") { temp = "T";          }
229                         allFiles = m->isTrue(temp);
230                         
231                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found") { temp = "1"; }
232                         convert(temp, processors); 
233                         
234                         if(oligoFileName == ""){        allFiles = 0;           }
235
236                         numFPrimers = 0;
237                         numRPrimers = 0;
238                 }
239                 
240         }
241         catch(exception& e) {
242                 m->errorOut(e, "TrimFlowsCommand", "TrimSeqsCommand");
243                 exit(1);
244         }
245 }
246
247 //***************************************************************************************************************
248
249 int TrimFlowsCommand::execute(){
250         try{
251                 
252                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
253
254                 string trimFlowFileName = outputDir + m->getRootName(m->getSimpleName(flowFileName)) + "trim.flow";
255                 outputNames.push_back(trimFlowFileName); outputTypes["flow"].push_back(trimFlowFileName);
256                 
257                 string scrapFlowFileName = outputDir + m->getRootName(m->getSimpleName(flowFileName)) + "scrap.flow";
258                 outputNames.push_back(scrapFlowFileName); outputTypes["flow"].push_back(scrapFlowFileName);
259
260                 string fastaFileName = outputDir + m->getRootName(m->getSimpleName(flowFileName)) + "flow.fasta";
261                 if(fasta){
262                         outputNames.push_back(fastaFileName); outputTypes["fasta"].push_back(fastaFileName);
263                 }
264                 
265                 vector<unsigned long int> flowFilePos = getFlowFileBreaks();
266                 for (int i = 0; i < (flowFilePos.size()-1); i++) {
267                         lines.push_back(new linePair(flowFilePos[i], flowFilePos[(i+1)]));
268                 }       
269
270                 vector<vector<string> > barcodePrimerComboFileNames;
271                 if(oligoFileName != ""){
272                         getOligos(barcodePrimerComboFileNames); 
273                 }
274                 
275 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
276                 if(processors == 1){
277                         driverCreateTrim(flowFileName, trimFlowFileName, scrapFlowFileName, fastaFileName, barcodePrimerComboFileNames, lines[0]);
278                 }else{
279                         createProcessesCreateTrim(flowFileName, trimFlowFileName, scrapFlowFileName, fastaFileName, barcodePrimerComboFileNames); 
280                 }       
281 #else
282                 driverCreateTrim(flowFileName, trimFlowFileName, scrapFlowFileName, fastaFileName, barcodePrimerComboFileNames, lines[0]);
283 #endif
284                 
285                 if (m->control_pressed) {  return 0; }                  
286                 
287                 if(allFiles){
288                         
289                         ofstream output;
290                         string flowFilesFileName = outputDir + m->getRootName(m->getSimpleName(flowFileName)) + "flow.files";
291                         m->openOutputFile(flowFilesFileName, output);
292
293                         for(int i=0;i<barcodePrimerComboFileNames.size();i++){
294                                 for(int j=0;j<barcodePrimerComboFileNames[0].size();j++){
295                                         
296                                         FILE * pFile;
297                                         unsigned long int size;
298                                         
299                                         //get num bytes in file
300                                         pFile = fopen (barcodePrimerComboFileNames[i][j].c_str(),"rb");
301                                         if (pFile==NULL) perror ("Error opening file");
302                                         else{
303                                                 fseek (pFile, 0, SEEK_END);
304                                                 size=ftell (pFile);
305                                                 fclose (pFile);
306                                         }
307
308                                         if(size < 10){
309 //                                              m->mothurOut("deleting: " + barcodePrimerComboFileNames[i][j] + '\n');
310                                                 remove(barcodePrimerComboFileNames[i][j].c_str());
311                                         }
312                                         else{
313 //                                              m->mothurOut("saving: " + barcodePrimerComboFileNames[i][j] + '\n');
314                                                 output << barcodePrimerComboFileNames[i][j] << endl;
315                                         }
316                                 }
317                         }
318                         output.close();
319                 }
320                 
321                 m->mothurOutEndLine();
322                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
323                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
324                 m->mothurOutEndLine();
325                 
326                 return 0;       
327         }
328         catch(exception& e) {
329                 m->errorOut(e, "TrimSeqsCommand", "execute");
330                 exit(1);
331         }
332 }
333
334 //***************************************************************************************************************
335
336 int TrimFlowsCommand::driverCreateTrim(string flowFileName, string trimFlowFileName, string scrapFlowFileName, string fastaFileName, vector<vector<string> > barcodePrimerComboFileNames, linePair* line){
337         
338         try {
339                 
340                 ofstream trimFlowFile;
341                 m->openOutputFile(trimFlowFileName, trimFlowFile);
342                 trimFlowFile.setf(ios::fixed, ios::floatfield); trimFlowFile.setf(ios::showpoint);
343
344                 ofstream scrapFlowFile;
345                 m->openOutputFile(scrapFlowFileName, scrapFlowFile);
346                 scrapFlowFile.setf(ios::fixed, ios::floatfield); scrapFlowFile.setf(ios::showpoint);
347
348                 if(line->start == 4){
349                         scrapFlowFile << totalFlows << endl;
350                         trimFlowFile << totalFlows << endl;
351                         for(int i=0;i<barcodePrimerComboFileNames.size();i++){
352                                 for(int j=0;j<barcodePrimerComboFileNames[0].size();j++){
353                                         //                              barcodePrimerComboFileNames[i][j] += toString(getpid()) + ".temp";
354                                         ofstream temp;
355                                         m->openOutputFile(barcodePrimerComboFileNames[i][j], temp);
356                                         temp << totalFlows << endl;
357                                         temp.close();
358                                 }
359                         }                       
360                 }
361                 
362                 FlowData flowData(numFlows, signal, noise, maxHomoP);
363                 
364                 ofstream fastaFile;
365                 if(fasta){      m->openOutputFile(fastaFileName, fastaFile);    }
366                 
367                 ifstream flowFile;
368                 m->openInputFile(flowFileName, flowFile);
369                 
370                 flowFile.seekg(line->start);
371
372                 int count = 0;
373                 bool moreSeqs = 1;
374                 
375                 while(moreSeqs) {
376                         
377                         int success = 1;
378                         int currentSeqDiffs = 0;
379                         string trashCode = "";
380                         
381                         flowData.getNext(flowFile);
382                         flowData.capFlows(totalFlows);  
383                         
384                         Sequence currSeq = flowData.getSequence();
385                         if(!flowData.hasMinFlows(minFlows)){    //screen to see if sequence is of a minimum number of flows
386                                 success = 0;
387                                 trashCode += 'l';
388                         }
389                         
390                         if(minLength > 0 || maxLength > 0){     //screen to see if sequence is above and below a specific number of bases
391                                 int seqLength = currSeq.getNumBases();
392                                 if(seqLength < minLength || seqLength > maxLength){
393                                         success = 0;
394                                         trashCode += 'l';
395                                 }
396                         }
397                         
398                         int primerIndex, barcodeIndex;
399                         
400                         if(barcodes.size() != 0){
401                                 success = stripBarcode(currSeq, barcodeIndex);
402                                 if(success > bdiffs)            {       trashCode += 'b';       }
403                                 else{ currentSeqDiffs += success;  }
404                         }
405                         
406                         if(numFPrimers != 0){
407                                 success = stripForward(currSeq, primerIndex);
408                                 if(success > pdiffs)            {       trashCode += 'f';       }
409                                 else{ currentSeqDiffs += success;  }
410                         }
411                         
412                         if (currentSeqDiffs > tdiffs)   {       trashCode += 't';   }
413                         
414                         if(numRPrimers != 0){
415                                 success = stripReverse(currSeq);
416                                 if(!success)                            {       trashCode += 'r';       }
417                         }
418                                                 
419                                 
420                         if(trashCode.length() == 0){
421                                 flowData.printFlows(trimFlowFile);
422                                 
423                                 if(fasta){
424                                         currSeq.printSequence(fastaFile);
425                                 }
426                                 
427                                 if(barcodes.size() != 0 || primers.size() != 0){
428 //                                      string fileName = outputDir + m->getRootName(m->getSimpleName(flowFileName)) + barcodePrimerCombos[barcodeIndex][primerIndex] + ".flow";
429                                         ofstream output;
430                                         m->openOutputFileAppend(barcodePrimerComboFileNames[barcodeIndex][primerIndex], output);
431                                         output.setf(ios::fixed, ios::floatfield); trimFlowFile.setf(ios::showpoint);
432                                         
433                                         flowData.printFlows(output);
434                                         output.close();
435                                 }
436                                 
437                         }
438                         else{
439                                 flowData.printFlows(scrapFlowFile, trashCode);
440                         }
441                                 
442                         count++;
443                                                 
444                         //report progress
445                         if((count) % 10000 == 0){       m->mothurOut(toString(count)); m->mothurOutEndLine();           }
446
447 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
448                         unsigned long int pos = flowFile.tellg();
449
450                         if ((pos == -1) || (pos >= line->end)) { break; }
451 #else
452                         if (flowFile.eof()) { break; }
453 #endif
454                         
455                 }
456                 //report progress
457                 if((count) % 10000 != 0){       m->mothurOut(toString(count)); m->mothurOutEndLine();           }
458                 
459                 trimFlowFile.close();
460                 scrapFlowFile.close();
461                 flowFile.close();
462                 if(fasta){      fastaFile.close();      }
463                 
464                 return count;
465         }
466         catch(exception& e) {
467                 m->errorOut(e, "TrimSeqsCommand", "driverCreateTrim");
468                 exit(1);
469         }
470 }
471
472 //***************************************************************************************************************
473
474 void TrimFlowsCommand::getOligos(vector<vector<string> >& outFlowFileNames){
475         try {
476                 ifstream oligosFile;
477                 m->openInputFile(oligoFileName, oligosFile);
478                 
479                 string type, oligo, group;
480
481                 int indexPrimer = 0;
482                 int indexBarcode = 0;
483                 
484                 while(!oligosFile.eof()){
485                 
486                         oligosFile >> type; m->gobble(oligosFile);      //get the first column value of the row - is it a comment or a feature we are interested in?
487
488                         if(type[0] == '#'){     //igore the line because there's a comment
489                                 while (!oligosFile.eof())       {       char c = oligosFile.get(); if (c == 10 || c == 13){     break;  }       } // get rest of line if there's any crap there
490                         }
491                         else{                           //there's a feature we're interested in
492
493                                 for(int i=0;i<type.length();i++){       type[i] = toupper(type[i]);  }                                  //make type case insensitive
494
495                                 oligosFile >> oligo;    //get the DNA sequence for the feature
496
497                                 for(int i=0;i<oligo.length();i++){      //make type case insensitive and change any U's to T's
498                                         oligo[i] = toupper(oligo[i]);
499                                         if(oligo[i] == 'U')     {       oligo[i] = 'T'; }
500                                 }
501
502                                 if(type == "FORWARD"){  //if the feature is a forward primer...
503                                         group = "";
504
505                                         while (!oligosFile.eof())       {       // get rest of line in case there is a primer name = will have the name of the primer
506                                                 char c = oligosFile.get(); 
507                                                 if (c == 10 || c == 13){        break;  }
508                                                 else if (c == 32 || c == 9){;} //space or tab
509                                                 else {  group += c;  }
510                                         } 
511
512                                         //have we seen this primer already?
513                                         map<string, int>::iterator itPrimer = primers.find(oligo);
514                                         if (itPrimer != primers.end()) { m->mothurOut("primer " + oligo + " is in your oligos file already."); m->mothurOutEndLine();  }
515
516                                         primers[oligo]=indexPrimer; indexPrimer++;
517                                         primerNameVector.push_back(group);
518
519                                 }
520                                 else if(type == "REVERSE"){
521                                         Sequence oligoRC("reverse", oligo);
522                                         oligoRC.reverseComplement();
523                                         revPrimer.push_back(oligoRC.getUnaligned());
524                                 }
525                                 else if(type == "BARCODE"){
526                                         oligosFile >> group;
527
528                                         //check for repeat barcodes
529                                         map<string, int>::iterator itBar = barcodes.find(oligo);
530                                         if (itBar != barcodes.end()) { m->mothurOut("barcode " + oligo + " is in your oligos file already."); m->mothurOutEndLine();  }
531
532                                         barcodes[oligo]=indexBarcode; indexBarcode++;
533                                         barcodeNameVector.push_back(group);
534                                 }
535                                 else{
536                                         m->mothurOut(type + " is not recognized as a valid type. Choices are forward, reverse, and barcode. Ignoring " + oligo + "."); m->mothurOutEndLine();  
537                                 }
538                         }
539
540                         m->gobble(oligosFile);
541                 }
542                 oligosFile.close();
543                 
544                 //add in potential combos       
545                 outFlowFileNames.resize(barcodeNameVector.size());
546                 for(int i=0;i<outFlowFileNames.size();i++){
547                         outFlowFileNames[i].assign(primerNameVector.size(), "");
548                 }
549                 
550                 if(allFiles){
551                         for(map<string, int>::iterator itBar = barcodes.begin();itBar != barcodes.end();itBar++){
552                                 for(map<string, int>::iterator itPrimer = primers.begin();itPrimer != primers.end(); itPrimer++){
553
554                                         string primerName = primerNameVector[itPrimer->second];
555                                         string barcodeName = barcodeNameVector[itBar->second];
556                                                                                 
557                                         string comboGroupName = "";
558                                         string fileName = "";
559                                         
560                                         if(primerName == ""){
561                                                 comboGroupName = barcodeNameVector[itBar->second];
562                                                 fileName = outputDir + m->getRootName(m->getSimpleName(flowFileName)) + comboGroupName + ".flow";
563                                         }
564                                         else{
565                                                 comboGroupName = barcodeNameVector[itBar->second] + "." + primerNameVector[itPrimer->second];
566                                                 fileName = outputDir + m->getRootName(m->getSimpleName(flowFileName)) + comboGroupName + ".flow";
567                                         }
568                                         
569                                         outputNames.push_back(fileName);
570                                         outputTypes["flow"].push_back(fileName);
571                                         outFlowFileNames[itBar->second][itPrimer->second] = fileName;
572                                         
573                                         ofstream temp;
574                                         m->openOutputFile(fileName, temp);
575                                         temp.close();
576                                 }
577                         }
578                 }
579                 
580                 numFPrimers = primers.size();
581                 numRPrimers = revPrimer.size();
582                 
583         }
584         catch(exception& e) {
585                 m->errorOut(e, "TrimSeqsCommand", "getOligos");
586                 exit(1);
587         }
588 }
589
590 //***************************************************************************************************************
591
592 int TrimFlowsCommand::stripBarcode(Sequence& seq, int& group){
593         try {
594                 
595                 string rawSequence = seq.getUnaligned();
596                 int success = bdiffs + 1;       //guilty until proven innocent
597                 
598                 //can you find the barcode
599                 for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
600                         string oligo = it->first;
601                         if(rawSequence.length() < oligo.length()){      //let's just assume that the barcodes are the same length
602                                 success = bdiffs + 10;                                  //if the sequence is shorter than the barcode then bail out
603                                 break;  
604                         }
605                         
606                         if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
607                                 group = it->second;
608                                 seq.setUnaligned(rawSequence.substr(oligo.length()));
609                                 
610                                 success = 0;
611                                 break;
612                         }
613                 }
614                 
615                 //if you found the barcode or if you don't want to allow for diffs
616                 if ((bdiffs == 0) || (success == 0)) { return success;  }
617                 
618                 else { //try aligning and see if you can find it
619                         
620                         int maxLength = 0;
621                         
622                         Alignment* alignment;
623                         if (barcodes.size() > 0) {
624                                 map<string,int>::iterator it=barcodes.begin();
625                                 
626                                 for(it;it!=barcodes.end();it++){
627                                         if(it->first.length() > maxLength){
628                                                 maxLength = it->first.length();
629                                         }
630                                 }
631                                 alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+bdiffs+1));  
632                                 
633                         }else{ alignment = NULL; } 
634                         
635                         //can you find the barcode
636                         int minDiff = 1e6;
637                         int minCount = 1;
638                         int minGroup = -1;
639                         int minPos = 0;
640                         
641                         for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
642                                 string oligo = it->first;
643                                 //                              int length = oligo.length();
644                                 
645                                 if(rawSequence.length() < maxLength){   //let's just assume that the barcodes are the same length
646                                         success = bdiffs + 10;
647                                         break;
648                                 }
649                                 
650                                 //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
651                                 alignment->align(oligo, rawSequence.substr(0,oligo.length()+bdiffs));
652                                 oligo = alignment->getSeqAAln();
653                                 string temp = alignment->getSeqBAln();
654                                 
655                                 int alnLength = oligo.length();
656                                 
657                                 for(int i=oligo.length()-1;i>=0;i--){
658                                         if(oligo[i] != '-'){    alnLength = i+1;        break;  }
659                                 }
660                                 oligo = oligo.substr(0,alnLength);
661                                 temp = temp.substr(0,alnLength);
662                                 
663                                 int numDiff = countDiffs(oligo, temp);
664                                 
665                                 if(numDiff < minDiff){
666                                         minDiff = numDiff;
667                                         minCount = 1;
668                                         minGroup = it->second;
669                                         minPos = 0;
670                                         for(int i=0;i<alnLength;i++){
671                                                 if(temp[i] != '-'){
672                                                         minPos++;
673                                                 }
674                                         }
675                                 }
676                                 else if(numDiff == minDiff){
677                                         minCount++;
678                                 }
679                                 
680                         }
681                         
682                         if(minDiff > bdiffs)    {       success = minDiff;              }       //no good matches
683                         else if(minCount > 1)   {       success = bdiffs + 100; }       //can't tell the difference between multiple barcodes
684                         else{                                                                                                   //use the best match
685                                 group = minGroup;
686                                 seq.setUnaligned(rawSequence.substr(minPos));
687                                 success = minDiff;
688                         }
689                         
690                         if (alignment != NULL) {  delete alignment;  }
691                         
692                 }
693                 
694                 return success;
695                 
696         }
697         catch(exception& e) {
698                 m->errorOut(e, "TrimFlowsCommand", "stripBarcode");
699                 exit(1);
700         }
701         
702 }
703
704 //***************************************************************************************************************
705
706 int TrimFlowsCommand::stripForward(Sequence& seq, int& group){
707         try {
708                 
709                 string rawSequence = seq.getUnaligned();
710                 int success = pdiffs + 1;       //guilty until proven innocent
711                 
712                 //can you find the primer
713                 for(map<string,int>::iterator it=primers.begin();it!=primers.end();it++){
714                         string oligo = it->first;
715                         if(rawSequence.length() < oligo.length()){      //let's just assume that the primers are the same length
716                                 success = pdiffs + 10;                                  //if the sequence is shorter than the barcode then bail out
717                                 break;  
718                         }
719                         
720                         if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
721                                 group = it->second;
722                                 seq.setUnaligned(rawSequence.substr(oligo.length()));
723                                 success = 0;
724                                 break;
725                         }
726                 }
727                 
728                 //if you found the barcode or if you don't want to allow for diffs
729                 if ((pdiffs == 0) || (success == 0)) { return success;  }
730                 
731                 else { //try aligning and see if you can find it
732                         
733                         int maxLength = 0;
734                         
735                         Alignment* alignment;
736                         if (primers.size() > 0) {
737                                 map<string,int>::iterator it=primers.begin();
738                                 
739                                 for(it;it!=primers.end();it++){
740                                         if(it->first.length() > maxLength){
741                                                 maxLength = it->first.length();
742                                         }
743                                 }
744                                 alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+pdiffs+1));  
745                                 
746                         }else{ alignment = NULL; } 
747                         
748                         //can you find the barcode
749                         int minDiff = 1e6;
750                         int minCount = 1;
751                         int minGroup = -1;
752                         int minPos = 0;
753                         
754                         for(map<string,int>::iterator it=primers.begin();it!=primers.end();it++){
755                                 string oligo = it->first;
756                                 //                              int length = oligo.length();
757                                 
758                                 if(rawSequence.length() < maxLength){   
759                                         success = pdiffs + 100;
760                                         break;
761                                 }
762                                 
763                                 //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
764                                 alignment->align(oligo, rawSequence.substr(0,oligo.length()+pdiffs));
765                                 oligo = alignment->getSeqAAln();
766                                 string temp = alignment->getSeqBAln();
767                                 
768                                 int alnLength = oligo.length();
769                                 
770                                 for(int i=oligo.length()-1;i>=0;i--){
771                                         if(oligo[i] != '-'){    alnLength = i+1;        break;  }
772                                 }
773                                 oligo = oligo.substr(0,alnLength);
774                                 temp = temp.substr(0,alnLength);
775                                 
776                                 int numDiff = countDiffs(oligo, temp);
777                                 
778                                 if(numDiff < minDiff){
779                                         minDiff = numDiff;
780                                         minCount = 1;
781                                         minGroup = it->second;
782                                         minPos = 0;
783                                         for(int i=0;i<alnLength;i++){
784                                                 if(temp[i] != '-'){
785                                                         minPos++;
786                                                 }
787                                         }
788                                 }
789                                 else if(numDiff == minDiff){
790                                         minCount++;
791                                 }
792                                 
793                         }
794                         
795                         if(minDiff > pdiffs)    {       success = minDiff;              }       //no good matches
796                         else if(minCount > 1)   {       success = pdiffs + 10;  }       //can't tell the difference between multiple primers
797                         else{                                                                                                   //use the best match
798                                 group = minGroup;
799                                 seq.setUnaligned(rawSequence.substr(minPos));
800                                 success = minDiff;
801                         }
802                         
803                         if (alignment != NULL) {  delete alignment;  }
804                         
805                 }
806                 
807                 return success;
808                 
809         }
810         catch(exception& e) {
811                 m->errorOut(e, "TrimFlowsCommand", "stripForward");
812                 exit(1);
813         }
814 }
815
816 //***************************************************************************************************************
817
818 bool TrimFlowsCommand::stripReverse(Sequence& seq){
819         try {
820
821                 string rawSequence = seq.getUnaligned();
822                 bool success = 0;       //guilty until proven innocent
823                 
824                 for(int i=0;i<numRPrimers;i++){
825                         string oligo = revPrimer[i];
826                         
827                         if(rawSequence.length() < oligo.length()){
828                                 success = 0;
829                                 break;
830                         }
831                         
832                         if(compareDNASeq(oligo, rawSequence.substr(rawSequence.length()-oligo.length(),oligo.length()))){
833                                 seq.setUnaligned(rawSequence.substr(0,rawSequence.length()-oligo.length()));
834                                 success = 1;
835                                 break;
836                         }
837                 }       
838
839                 return success;
840                 
841         }
842         catch(exception& e) {
843                 m->errorOut(e, "TrimFlowsCommand", "stripReverse");
844                 exit(1);
845         }
846 }
847
848
849 //***************************************************************************************************************
850
851 bool TrimFlowsCommand::compareDNASeq(string oligo, string seq){
852         try {
853                 bool success = 1;
854                 int length = oligo.length();
855                 
856                 for(int i=0;i<length;i++){
857                         
858                         if(oligo[i] != seq[i]){
859                                 if(oligo[i] == 'A' || oligo[i] == 'T' || oligo[i] == 'G' || oligo[i] == 'C')    {       success = 0;    }
860                                 else if((oligo[i] == 'N' || oligo[i] == 'I') && (seq[i] == 'N'))                                {       success = 0;    }
861                                 else if(oligo[i] == 'R' && (seq[i] != 'A' && seq[i] != 'G'))                                    {       success = 0;    }
862                                 else if(oligo[i] == 'Y' && (seq[i] != 'C' && seq[i] != 'T'))                                    {       success = 0;    }
863                                 else if(oligo[i] == 'M' && (seq[i] != 'C' && seq[i] != 'A'))                                    {       success = 0;    }
864                                 else if(oligo[i] == 'K' && (seq[i] != 'T' && seq[i] != 'G'))                                    {       success = 0;    }
865                                 else if(oligo[i] == 'W' && (seq[i] != 'T' && seq[i] != 'A'))                                    {       success = 0;    }
866                                 else if(oligo[i] == 'S' && (seq[i] != 'C' && seq[i] != 'G'))                                    {       success = 0;    }
867                                 else if(oligo[i] == 'B' && (seq[i] != 'C' && seq[i] != 'T' && seq[i] != 'G'))   {       success = 0;    }
868                                 else if(oligo[i] == 'D' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'G'))   {       success = 0;    }
869                                 else if(oligo[i] == 'H' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'C'))   {       success = 0;    }
870                                 else if(oligo[i] == 'V' && (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G'))   {       success = 0;    }                       
871                                 
872                                 if(success == 0)        {       break;   }
873                         }
874                         else{
875                                 success = 1;
876                         }
877                 }
878                 
879                 return success;
880         }
881         catch(exception& e) {
882                 m->errorOut(e, "TrimFlowsCommand", "compareDNASeq");
883                 exit(1);
884         }
885         
886 }
887
888 //***************************************************************************************************************
889
890 int TrimFlowsCommand::countDiffs(string oligo, string seq){
891         try {
892                 
893                 int length = oligo.length();
894                 int countDiffs = 0;
895                 
896                 for(int i=0;i<length;i++){
897                         
898                         if(oligo[i] != seq[i]){
899                                 if(oligo[i] == 'A' || oligo[i] == 'T' || oligo[i] == 'G' || oligo[i] == 'C' || oligo[i] == '-' || oligo[i] == '.')      {       countDiffs++;   }
900                                 else if((oligo[i] == 'N' || oligo[i] == 'I') && (seq[i] == 'N'))                                {       countDiffs++;   }
901                                 else if(oligo[i] == 'R' && (seq[i] != 'A' && seq[i] != 'G'))                                    {       countDiffs++;   }
902                                 else if(oligo[i] == 'Y' && (seq[i] != 'C' && seq[i] != 'T'))                                    {       countDiffs++;   }
903                                 else if(oligo[i] == 'M' && (seq[i] != 'C' && seq[i] != 'A'))                                    {       countDiffs++;   }
904                                 else if(oligo[i] == 'K' && (seq[i] != 'T' && seq[i] != 'G'))                                    {       countDiffs++;   }
905                                 else if(oligo[i] == 'W' && (seq[i] != 'T' && seq[i] != 'A'))                                    {       countDiffs++;   }
906                                 else if(oligo[i] == 'S' && (seq[i] != 'C' && seq[i] != 'G'))                                    {       countDiffs++;   }
907                                 else if(oligo[i] == 'B' && (seq[i] != 'C' && seq[i] != 'T' && seq[i] != 'G'))   {       countDiffs++;   }
908                                 else if(oligo[i] == 'D' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'G'))   {       countDiffs++;   }
909                                 else if(oligo[i] == 'H' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'C'))   {       countDiffs++;   }
910                                 else if(oligo[i] == 'V' && (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G'))   {       countDiffs++;   }       
911                         }
912                         
913                 }
914                 
915                 return countDiffs;
916         }
917         catch(exception& e) {
918                 m->errorOut(e, "TrimFlowsCommand", "countDiffs");
919                 exit(1);
920         }
921         
922 }
923
924 /**************************************************************************************************/
925
926 vector<unsigned long int> TrimFlowsCommand::getFlowFileBreaks() {
927
928         try{
929                         
930                 vector<unsigned long int> filePos;
931                 filePos.push_back(0);
932                                         
933                 FILE * pFile;
934                 unsigned long int size;
935                 
936                 //get num bytes in file
937                 pFile = fopen (flowFileName.c_str(),"rb");
938                 if (pFile==NULL) perror ("Error opening file");
939                 else{
940                         fseek (pFile, 0, SEEK_END);
941                         size=ftell (pFile);
942                         fclose (pFile);
943                 }
944                                 
945                 //estimate file breaks
946                 unsigned long int chunkSize = 0;
947                 chunkSize = size / processors;
948
949                 //file too small to divide by processors
950                 if (chunkSize == 0)  {  processors = 1; filePos.push_back(size); return filePos;        }
951                 
952                 //for each process seekg to closest file break and search for next '>' char. make that the filebreak
953                 for (int i = 0; i < processors; i++) {
954                         unsigned long int spot = (i+1) * chunkSize;
955                         
956                         ifstream in;
957                         m->openInputFile(flowFileName, in);
958                         in.seekg(spot);
959                         
960                         string dummy = m->getline(in);
961                         
962                         //there was not another sequence before the end of the file
963                         unsigned long int sanityPos = in.tellg();
964                         
965 //                      if (sanityPos == -1) {  break;  }
966 //                      else {  filePos.push_back(newSpot);  }
967                         if (sanityPos == -1) {  break;  }
968                         else {  filePos.push_back(sanityPos);  }
969                         
970                         in.close();
971                 }
972                 
973                 //save end pos
974                 filePos.push_back(size);
975                 
976                 //sanity check filePos
977                 for (int i = 0; i < (filePos.size()-1); i++) {
978                         if (filePos[(i+1)] <= filePos[i]) {  filePos.erase(filePos.begin()+(i+1)); i--; }
979                 }
980
981                 ifstream in;
982                 m->openInputFile(flowFileName, in);
983                 in >> numFlows;
984                 m->gobble(in);
985                 unsigned long int spot = in.tellg();
986                 filePos[0] = spot;
987                 in.close();
988                 
989                 processors = (filePos.size() - 1);
990                 
991                 return filePos; 
992         }
993         catch(exception& e) {
994                 m->errorOut(e, "TrimSeqsCommand", "getFlowFileBreaks");
995                 exit(1);
996         }
997 }
998
999 /**************************************************************************************************/
1000
1001 int TrimFlowsCommand::createProcessesCreateTrim(string flowFileName, string trimFlowFileName, string scrapFlowFileName, string fastaFileName, vector<vector<string> > barcodePrimerComboFileNames){
1002
1003         try {
1004 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
1005                 int process = 1;
1006                 int exitCommand = 1;
1007                 processIDS.clear();
1008                 
1009                 //loop through and create all the processes you want
1010                 while (process != processors) {
1011                         int pid = fork();
1012                         
1013                         if (pid > 0) {
1014                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
1015                                 process++;
1016                         }else if (pid == 0){
1017                                 
1018                                 vector<vector<string> > tempBarcodePrimerComboFileNames = barcodePrimerComboFileNames;
1019                                 for(int i=0;i<tempBarcodePrimerComboFileNames.size();i++){
1020                                         for(int j=0;j<tempBarcodePrimerComboFileNames[0].size();j++){
1021                                                 tempBarcodePrimerComboFileNames[i][j] += toString(getpid()) + ".temp";
1022                                                 ofstream temp;
1023                                                 m->openOutputFile(tempBarcodePrimerComboFileNames[i][j], temp);
1024                                                 temp.close();
1025                                                 
1026                                         }
1027                                 }
1028                                                 
1029                                 driverCreateTrim(flowFileName,
1030                                                                  (trimFlowFileName + toString(getpid()) + ".temp"),
1031                                                                  (scrapFlowFileName + toString(getpid()) + ".temp"),
1032                                                                  (fastaFileName + toString(getpid()) + ".temp"),
1033                                                                  tempBarcodePrimerComboFileNames, lines[process]);
1034
1035                                 exit(0);
1036                         }else { 
1037                                 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine(); 
1038                                 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
1039                                 exit(0);
1040                         }
1041                 }
1042                 
1043                 //parent do my part
1044                 ofstream temp;
1045                 m->openOutputFile(trimFlowFileName, temp);
1046                 temp.close();
1047
1048                 m->openOutputFile(scrapFlowFileName, temp);
1049                 temp.close();
1050                 
1051                 if(fasta){
1052                         m->openOutputFile(fastaFileName, temp);
1053                         temp.close();
1054                 }
1055                 
1056                 driverCreateTrim(flowFileName, trimFlowFileName, scrapFlowFileName, fastaFileName, barcodePrimerComboFileNames, lines[0]);
1057
1058                 //force parent to wait until all the processes are done
1059                 for (int i=0;i<processIDS.size();i++) { 
1060                         int temp = processIDS[i];
1061                         wait(&temp);
1062                 }
1063                 
1064                 //append files
1065                 m->mothurOutEndLine();
1066                 for(int i=0;i<processIDS.size();i++){
1067                         
1068                         m->mothurOut("Appending files from process " + toString(processIDS[i])); m->mothurOutEndLine();
1069                         
1070                         m->appendFiles((trimFlowFileName + toString(processIDS[i]) + ".temp"), trimFlowFileName);
1071                         remove((trimFlowFileName + toString(processIDS[i]) + ".temp").c_str());
1072 //                      m->mothurOut("\tDone with trim.flow file"); m->mothurOutEndLine();
1073
1074                         m->appendFiles((scrapFlowFileName + toString(processIDS[i]) + ".temp"), scrapFlowFileName);
1075                         remove((scrapFlowFileName + toString(processIDS[i]) + ".temp").c_str());
1076 //                      m->mothurOut("\tDone with scrap.flow file"); m->mothurOutEndLine();
1077
1078                         if(fasta){
1079                                 m->appendFiles((fastaFileName + toString(processIDS[i]) + ".temp"), fastaFileName);
1080                                 remove((fastaFileName + toString(processIDS[i]) + ".temp").c_str());
1081 //                              m->mothurOut("\tDone with flow.fasta file"); m->mothurOutEndLine();
1082                         }
1083                         if(allFiles){                                           
1084                                 for (int j = 0; j < barcodePrimerComboFileNames.size(); j++) {
1085                                         for (int k = 0; k < barcodePrimerComboFileNames[0].size(); k++) {
1086                                                 m->appendFiles((barcodePrimerComboFileNames[j][k] + toString(processIDS[i]) + ".temp"), barcodePrimerComboFileNames[j][k]);
1087                                                 remove((barcodePrimerComboFileNames[j][k] + toString(processIDS[i]) + ".temp").c_str());
1088                                         }
1089                                 }
1090                         }
1091                 }
1092                 
1093                 return exitCommand;
1094 #endif          
1095         }
1096         catch(exception& e) {
1097                 m->errorOut(e, "TrimFlowsCommand", "createProcessesCreateTrim");
1098                 exit(1);
1099         }
1100 }
1101
1102 //***************************************************************************************************************